Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
-rw-r--r--  sys/contrib/openzfs/module/zfs/Makefile.in | 157
-rw-r--r--  sys/contrib/openzfs/module/zfs/abd.c | 204
-rw-r--r--  sys/contrib/openzfs/module/zfs/aggsum.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c | 3124
-rw-r--r--  sys/contrib/openzfs/module/zfs/blake3_zfs.c | 120
-rw-r--r--  sys/contrib/openzfs/module/zfs/blkptr.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/bplist.c | 10
-rw-r--r--  sys/contrib/openzfs/module/zfs/bpobj.c | 134
-rw-r--r--  sys/contrib/openzfs/module/zfs/bptree.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/bqueue.c | 122
-rw-r--r--  sys/contrib/openzfs/module/zfs/brt.c | 1673
-rw-r--r--  sys/contrib/openzfs/module/zfs/btree.c | 847
-rw-r--r--  sys/contrib/openzfs/module/zfs/dataset_kstats.c | 74
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf.c | 1238
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf_stats.c | 17
-rw-r--r--  sys/contrib/openzfs/module/zfs/ddt.c | 674
-rw-r--r--  sys/contrib/openzfs/module/zfs/ddt_stats.c | 212
-rw-r--r--  sys/contrib/openzfs/module/zfs/ddt_zap.c | 151
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu.c | 521
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_diff.c | 6
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_object.c | 16
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_objset.c | 260
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_recv.c | 596
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_redact.c | 46
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_send.c | 152
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_traverse.c | 58
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_tx.c | 237
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_zfetch.c | 488
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode.c | 318
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode_sync.c | 68
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_bookmark.c | 130
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_crypt.c | 148
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_dataset.c | 557
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_deadlist.c | 145
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_deleg.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_destroy.c | 49
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_dir.c | 141
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_pool.c | 137
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_prop.c | 117
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_scan.c | 1433
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_synctask.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_userhold.c | 10
-rw-r--r--  sys/contrib/openzfs/module/zfs/edonr_zfs.c | 20
-rw-r--r--  sys/contrib/openzfs/module/zfs/fm.c | 45
-rw-r--r--  sys/contrib/openzfs/module/zfs/gzip.c | 9
-rw-r--r--  sys/contrib/openzfs/module/zfs/hkdf.c | 16
-rw-r--r--  sys/contrib/openzfs/module/zfs/lz4.c | 1769
-rw-r--r--  sys/contrib/openzfs/module/zfs/lz4_zfs.c | 935
-rw-r--r--  sys/contrib/openzfs/module/zfs/lzjb.c | 6
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c | 608
-rw-r--r--  sys/contrib/openzfs/module/zfs/mmp.c | 39
-rw-r--r--  sys/contrib/openzfs/module/zfs/multilist.c | 38
-rw-r--r--  sys/contrib/openzfs/module/zfs/pathname.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/range_tree.c | 110
-rw-r--r--  sys/contrib/openzfs/module/zfs/refcount.c | 217
-rw-r--r--  sys/contrib/openzfs/module/zfs/rrwlock.c | 24
-rw-r--r--  sys/contrib/openzfs/module/zfs/sa.c | 64
-rw-r--r--  sys/contrib/openzfs/module/zfs/sha2_zfs.c (renamed from sys/contrib/openzfs/module/zfs/sha256.c) | 21
-rw-r--r--  sys/contrib/openzfs/module/zfs/skein_zfs.c | 14
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c | 1806
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_checkpoint.c | 23
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_config.c | 47
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_errlog.c | 1210
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_history.c | 5
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_log_spacemap.c | 327
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_misc.c | 350
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_stats.c | 76
-rw-r--r--  sys/contrib/openzfs/module/zfs/space_map.c | 9
-rw-r--r--  sys/contrib/openzfs/module/zfs/space_reftree.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/txg.c | 46
-rw-r--r--  sys/contrib/openzfs/module/zfs/uberblock.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/unique.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c | 1361
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_cache.c | 437
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_draid.c | 127
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_indirect.c | 74
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_indirect_births.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c | 6
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_initialize.c | 122
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c | 202
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_mirror.c | 174
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_missing.c | 9
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_queue.c | 454
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c | 2638
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math.c | 48
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c | 6
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h | 164
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h | 46
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_rebuild.c | 82
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_removal.c | 325
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_root.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_trim.c | 182
-rw-r--r--  sys/contrib/openzfs/module/zfs/zap.c | 465
-rw-r--r--  sys/contrib/openzfs/module/zfs/zap_leaf.c | 96
-rw-r--r--  sys/contrib/openzfs/module/zfs/zap_micro.c | 489
-rw-r--r--  sys/contrib/openzfs/module/zfs/zcp.c | 51
-rw-r--r--  sys/contrib/openzfs/module/zfs/zcp_get.c | 52
-rw-r--r--  sys/contrib/openzfs/module/zfs/zcp_iter.c | 46
-rw-r--r--  sys/contrib/openzfs/module/zfs/zcp_synctask.c | 91
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfeature.c | 9
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_byteswap.c | 15
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_chksum.c | 379
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_fm.c | 201
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_fuid.c | 34
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_impl.c (renamed from sys/contrib/openzfs/module/zfs/spa_boot.c) | 53
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_ioctl.c | 683
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_log.c | 273
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_onexit.c | 9
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_quota.c | 5
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_ratelimit.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_replay.c | 337
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_rlock.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_sa.c | 44
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_vnops.c | 936
-rw-r--r--  sys/contrib/openzfs/module/zfs/zil.c | 2067
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c | 890
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_checksum.c | 51
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_compress.c | 17
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_inject.c | 162
-rw-r--r--  sys/contrib/openzfs/module/zfs/zle.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zrlock.c | 10
-rw-r--r--  sys/contrib/openzfs/module/zfs/zthr.c | 8
-rw-r--r--  sys/contrib/openzfs/module/zfs/zvol.c | 328
132 files changed, 25472 insertions, 11066 deletions
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in
deleted file mode 100644
index 653ea0da9bcc..000000000000
--- a/sys/contrib/openzfs/module/zfs/Makefile.in
+++ /dev/null
@@ -1,157 +0,0 @@
-ifneq ($(KBUILD_EXTMOD),)
-src = @abs_srcdir@
-obj = @abs_builddir@
-mfdir = $(obj)
-else
-mfdir = $(srctree)/$(src)
-endif
-
-MODULE := zfs
-
-obj-$(CONFIG_ZFS) := $(MODULE).o
-
-# Suppress unused-value warnings in sparc64 architecture headers
-ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
-
-$(MODULE)-objs += abd.o
-$(MODULE)-objs += aggsum.o
-$(MODULE)-objs += arc.o
-$(MODULE)-objs += blkptr.o
-$(MODULE)-objs += bplist.o
-$(MODULE)-objs += bpobj.o
-$(MODULE)-objs += bptree.o
-$(MODULE)-objs += btree.o
-$(MODULE)-objs += bqueue.o
-$(MODULE)-objs += dataset_kstats.o
-$(MODULE)-objs += dbuf.o
-$(MODULE)-objs += dbuf_stats.o
-$(MODULE)-objs += ddt.o
-$(MODULE)-objs += ddt_zap.o
-$(MODULE)-objs += dmu.o
-$(MODULE)-objs += dmu_diff.o
-$(MODULE)-objs += dmu_object.o
-$(MODULE)-objs += dmu_objset.o
-$(MODULE)-objs += dmu_recv.o
-$(MODULE)-objs += dmu_redact.o
-$(MODULE)-objs += dmu_send.o
-$(MODULE)-objs += dmu_traverse.o
-$(MODULE)-objs += dmu_tx.o
-$(MODULE)-objs += dmu_zfetch.o
-$(MODULE)-objs += dnode.o
-$(MODULE)-objs += dnode_sync.o
-$(MODULE)-objs += dsl_bookmark.o
-$(MODULE)-objs += dsl_crypt.o
-$(MODULE)-objs += dsl_dataset.o
-$(MODULE)-objs += dsl_deadlist.o
-$(MODULE)-objs += dsl_deleg.o
-$(MODULE)-objs += dsl_destroy.o
-$(MODULE)-objs += dsl_dir.o
-$(MODULE)-objs += dsl_pool.o
-$(MODULE)-objs += dsl_prop.o
-$(MODULE)-objs += dsl_scan.o
-$(MODULE)-objs += dsl_synctask.o
-$(MODULE)-objs += dsl_userhold.o
-$(MODULE)-objs += edonr_zfs.o
-$(MODULE)-objs += fm.o
-$(MODULE)-objs += gzip.o
-$(MODULE)-objs += hkdf.o
-$(MODULE)-objs += lz4.o
-$(MODULE)-objs += lzjb.o
-$(MODULE)-objs += metaslab.o
-$(MODULE)-objs += mmp.o
-$(MODULE)-objs += multilist.o
-$(MODULE)-objs += objlist.o
-$(MODULE)-objs += pathname.o
-$(MODULE)-objs += range_tree.o
-$(MODULE)-objs += refcount.o
-$(MODULE)-objs += rrwlock.o
-$(MODULE)-objs += sa.o
-$(MODULE)-objs += sha256.o
-$(MODULE)-objs += skein_zfs.o
-$(MODULE)-objs += spa.o
-$(MODULE)-objs += spa_boot.o
-$(MODULE)-objs += spa_checkpoint.o
-$(MODULE)-objs += spa_config.o
-$(MODULE)-objs += spa_errlog.o
-$(MODULE)-objs += spa_history.o
-$(MODULE)-objs += spa_log_spacemap.o
-$(MODULE)-objs += spa_misc.o
-$(MODULE)-objs += spa_stats.o
-$(MODULE)-objs += space_map.o
-$(MODULE)-objs += space_reftree.o
-$(MODULE)-objs += txg.o
-$(MODULE)-objs += uberblock.o
-$(MODULE)-objs += unique.o
-$(MODULE)-objs += vdev.o
-$(MODULE)-objs += vdev_cache.o
-$(MODULE)-objs += vdev_draid.o
-$(MODULE)-objs += vdev_draid_rand.o
-$(MODULE)-objs += vdev_indirect.o
-$(MODULE)-objs += vdev_indirect_births.o
-$(MODULE)-objs += vdev_indirect_mapping.o
-$(MODULE)-objs += vdev_initialize.o
-$(MODULE)-objs += vdev_label.o
-$(MODULE)-objs += vdev_mirror.o
-$(MODULE)-objs += vdev_missing.o
-$(MODULE)-objs += vdev_queue.o
-$(MODULE)-objs += vdev_raidz.o
-$(MODULE)-objs += vdev_raidz_math.o
-$(MODULE)-objs += vdev_raidz_math_scalar.o
-$(MODULE)-objs += vdev_rebuild.o
-$(MODULE)-objs += vdev_removal.o
-$(MODULE)-objs += vdev_root.o
-$(MODULE)-objs += vdev_trim.o
-$(MODULE)-objs += zap.o
-$(MODULE)-objs += zap_leaf.o
-$(MODULE)-objs += zap_micro.o
-$(MODULE)-objs += zcp.o
-$(MODULE)-objs += zcp_get.o
-$(MODULE)-objs += zcp_global.o
-$(MODULE)-objs += zcp_iter.o
-$(MODULE)-objs += zcp_set.o
-$(MODULE)-objs += zcp_synctask.o
-$(MODULE)-objs += zfeature.o
-$(MODULE)-objs += zfs_byteswap.o
-$(MODULE)-objs += zfs_fm.o
-$(MODULE)-objs += zfs_fuid.o
-$(MODULE)-objs += zfs_ioctl.o
-$(MODULE)-objs += zfs_log.o
-$(MODULE)-objs += zfs_onexit.o
-$(MODULE)-objs += zfs_quota.o
-$(MODULE)-objs += zfs_ratelimit.o
-$(MODULE)-objs += zfs_replay.o
-$(MODULE)-objs += zfs_rlock.o
-$(MODULE)-objs += zfs_sa.o
-$(MODULE)-objs += zfs_vnops.o
-$(MODULE)-objs += zil.o
-$(MODULE)-objs += zio.o
-$(MODULE)-objs += zio_checksum.o
-$(MODULE)-objs += zio_compress.o
-$(MODULE)-objs += zio_inject.o
-$(MODULE)-objs += zle.o
-$(MODULE)-objs += zrlock.o
-$(MODULE)-objs += zthr.o
-$(MODULE)-objs += zvol.o
-
-# Suppress incorrect warnings from versions of objtool which are not
-# aware of x86 EVEX prefix instructions used for AVX512.
-OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y
-OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y
-
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o
-$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o
-
-$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o
-$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o
-
-$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o
-$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o
-
-ifeq ($(CONFIG_ALTIVEC),y)
-$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec
-endif
-
-include $(mfdir)/../os/linux/zfs/Makefile
diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c
index bf39cd613330..2c0cda25dbc6 100644
--- a/sys/contrib/openzfs/module/zfs/abd.c
+++ b/sys/contrib/openzfs/module/zfs/abd.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -109,7 +109,6 @@ void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
- ASSERT3U(abd->abd_size, >, 0);
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
@@ -118,6 +117,7 @@ abd_verify(abd_t *abd)
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
+ ASSERT3U(abd->abd_size, >, 0);
ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
} else if (abd_is_gang(abd)) {
uint_t child_sizes = 0;
@@ -130,6 +130,7 @@ abd_verify(abd_t *abd)
}
ASSERT3U(abd->abd_size, ==, child_sizes);
} else {
+ ASSERT3U(abd->abd_size, >, 0);
abd_verify_scatter(abd);
}
#endif
@@ -369,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
* will retain all the free_on_free settings after being
* added to the parents list.
*/
+#ifdef ZFS_DEBUG
+ /*
+ * If cabd had abd_parent, we have to drop it here. We can't
+ * transfer it to pabd, nor we can clear abd_size leaving it.
+ */
+ if (cabd->abd_parent != NULL) {
+ (void) zfs_refcount_remove_many(
+ &cabd->abd_parent->abd_children,
+ cabd->abd_size, cabd);
+ cabd->abd_parent = NULL;
+ }
+#endif
pabd->abd_size += cabd->abd_size;
+ cabd->abd_size = 0;
list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
&ABD_GANG(cabd).abd_gang_chain);
ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
@@ -407,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
*/
if (abd_is_gang(cabd)) {
ASSERT(!list_link_active(&cabd->abd_gang_link));
- ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
return (abd_gang_add_gang(pabd, cabd, free_on_free));
}
ASSERT(!abd_is_gang(cabd));
@@ -667,15 +680,15 @@ abd_return_buf(abd_t *abd, void *buf, size_t n)
{
abd_verify(abd);
ASSERT3U(abd->abd_size, >=, n);
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+#endif
if (abd_is_linear(abd)) {
ASSERT3P(buf, ==, abd_to_buf(abd));
} else {
ASSERT0(abd_cmp_buf(abd, buf, n));
zio_buf_free(buf, n);
}
-#ifdef ZFS_DEBUG
- (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
-#endif
}
void
@@ -789,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
- boolean_t gang = abd_is_gang(abd);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
- /* If we are at the end of the gang ABD we are done */
- if (gang && !c_abd)
- break;
+ IMPLY(abd_is_gang(abd), c_abd != NULL);
abd_iter_map(&aiter);
@@ -816,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
return (ret);
}
+#if defined(__linux__) && defined(_KERNEL)
+int
+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_page_func_t *func, void *private)
+{
+ struct abd_iter aiter;
+ int ret = 0;
+
+ if (size == 0)
+ return (0);
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
+
+ while (size > 0) {
+ IMPLY(abd_is_gang(abd), c_abd != NULL);
+
+ abd_iter_page(&aiter);
+
+ size_t len = MIN(aiter.iter_page_dsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_page, aiter.iter_page_doff,
+ len, private);
+
+ aiter.iter_page = NULL;
+ aiter.iter_page_doff = 0;
+ aiter.iter_page_dsize = 0;
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
+ }
+
+ return (ret);
+}
+#endif
+
struct buf_arg {
void *arg_buf;
};
@@ -889,10 +941,10 @@ abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
&ba_ptr);
}
-/*ARGSUSED*/
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
+ (void) private;
(void) memset(buf, 0, size);
return (0);
}
@@ -917,7 +969,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
{
int ret = 0;
struct abd_iter daiter, saiter;
- boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
abd_t *c_dabd, *c_sabd;
if (size == 0)
@@ -929,16 +980,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
ASSERT3U(doff + size, <=, dabd->abd_size);
ASSERT3U(soff + size, <=, sabd->abd_size);
- dabd_is_gang_abd = abd_is_gang(dabd);
- sabd_is_gang_abd = abd_is_gang(sabd);
c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
while (size > 0) {
- /* if we are at the end of the gang ABD we are done */
- if ((dabd_is_gang_abd && !c_dabd) ||
- (sabd_is_gang_abd && !c_sabd))
- break;
+ IMPLY(abd_is_gang(dabd), c_dabd != NULL);
+ IMPLY(abd_is_gang(sabd), c_sabd != NULL);
abd_iter_map(&daiter);
abd_iter_map(&saiter);
@@ -967,10 +1014,10 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
return (ret);
}
-/*ARGSUSED*/
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
+ (void) private;
(void) memcpy(dbuf, sbuf, size);
return (0);
}
@@ -985,10 +1032,10 @@ abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
abd_copy_off_cb, NULL);
}
-/*ARGSUSED*/
static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
+ (void) private;
return (memcmp(bufa, bufb, size));
}
@@ -1012,87 +1059,63 @@ abd_cmp(abd_t *dabd, abd_t *sabd)
* is the same when taking linear and when taking scatter
*/
void
-abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
- ssize_t csize, ssize_t dsize, const unsigned parity,
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
+ size_t csize, size_t dsize, const unsigned parity,
void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
int i;
- ssize_t len, dlen;
+ size_t len, dlen;
struct abd_iter caiters[3];
- struct abd_iter daiter = {0};
- void *caddrs[3];
+ struct abd_iter daiter;
+ void *caddrs[3], *daddr;
unsigned long flags __maybe_unused = 0;
abd_t *c_cabds[3];
abd_t *c_dabd = NULL;
- boolean_t cabds_is_gang_abd[3];
- boolean_t dabd_is_gang_abd = B_FALSE;
ASSERT3U(parity, <=, 3);
-
for (i = 0; i < parity; i++) {
- cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
- c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
+ abd_verify(cabds[i]);
+ ASSERT3U(off + csize, <=, cabds[i]->abd_size);
+ c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off);
}
- if (dabd) {
- dabd_is_gang_abd = abd_is_gang(dabd);
- c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
+ if (dsize > 0) {
+ ASSERT(dabd);
+ abd_verify(dabd);
+ ASSERT3U(off + dsize, <=, dabd->abd_size);
+ c_dabd = abd_init_abd_iter(dabd, &daiter, off);
}
- ASSERT3S(dsize, >=, 0);
-
abd_enter_critical(flags);
while (csize > 0) {
- /* if we are at the end of the gang ABD we are done */
- if (dabd_is_gang_abd && !c_dabd)
- break;
-
+ len = csize;
for (i = 0; i < parity; i++) {
- /*
- * If we are at the end of the gang ABD we are
- * done.
- */
- if (cabds_is_gang_abd[i] && !c_cabds[i])
- break;
+ IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
abd_iter_map(&caiters[i]);
caddrs[i] = caiters[i].iter_mapaddr;
+ len = MIN(caiters[i].iter_mapsize, len);
}
- len = csize;
-
- if (dabd && dsize > 0)
+ if (dsize > 0) {
+ IMPLY(abd_is_gang(dabd), c_dabd != NULL);
abd_iter_map(&daiter);
-
- switch (parity) {
- case 3:
- len = MIN(caiters[2].iter_mapsize, len);
- fallthrough;
- case 2:
- len = MIN(caiters[1].iter_mapsize, len);
- fallthrough;
- case 1:
- len = MIN(caiters[0].iter_mapsize, len);
- }
-
- /* must be progressive */
- ASSERT3S(len, >, 0);
-
- if (dabd && dsize > 0) {
- /* this needs precise iter.length */
+ daddr = daiter.iter_mapaddr;
len = MIN(daiter.iter_mapsize, len);
dlen = len;
- } else
+ } else {
+ daddr = NULL;
dlen = 0;
+ }
/* must be progressive */
- ASSERT3S(len, >, 0);
+ ASSERT3U(len, >, 0);
/*
* The iterated function likely will not do well if each
* segment except the last one is not multiple of 512 (raidz).
*/
ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
- func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+ func_raidz_gen(caddrs, daddr, len, dlen);
for (i = parity-1; i >= 0; i--) {
abd_iter_unmap(&caiters[i]);
@@ -1101,7 +1124,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
&caiters[i], len);
}
- if (dabd && dsize > 0) {
+ if (dsize > 0) {
abd_iter_unmap(&daiter);
c_dabd =
abd_advance_abd_iter(dabd, c_dabd, &daiter,
@@ -1110,9 +1133,6 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
}
csize -= len;
-
- ASSERT3S(dsize, >=, 0);
- ASSERT3S(csize, >=, 0);
}
abd_exit_critical(flags);
}
@@ -1129,27 +1149,27 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
*/
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
- ssize_t tsize, const unsigned parity,
+ size_t tsize, const unsigned parity,
void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
const unsigned *mul),
const unsigned *mul)
{
int i;
- ssize_t len;
+ size_t len;
struct abd_iter citers[3];
struct abd_iter xiters[3];
void *caddrs[3], *xaddrs[3];
unsigned long flags __maybe_unused = 0;
- boolean_t cabds_is_gang_abd[3];
- boolean_t tabds_is_gang_abd[3];
abd_t *c_cabds[3];
abd_t *c_tabds[3];
ASSERT3U(parity, <=, 3);
for (i = 0; i < parity; i++) {
- cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
- tabds_is_gang_abd[i] = abd_is_gang(tabds[i]);
+ abd_verify(cabds[i]);
+ abd_verify(tabds[i]);
+ ASSERT3U(tsize, <=, cabds[i]->abd_size);
+ ASSERT3U(tsize, <=, tabds[i]->abd_size);
c_cabds[i] =
abd_init_abd_iter(cabds[i], &citers[i], 0);
c_tabds[i] =
@@ -1158,36 +1178,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
abd_enter_critical(flags);
while (tsize > 0) {
-
+ len = tsize;
for (i = 0; i < parity; i++) {
- /*
- * If we are at the end of the gang ABD we
- * are done.
- */
- if (cabds_is_gang_abd[i] && !c_cabds[i])
- break;
- if (tabds_is_gang_abd[i] && !c_tabds[i])
- break;
+ IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
+ IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
abd_iter_map(&citers[i]);
abd_iter_map(&xiters[i]);
caddrs[i] = citers[i].iter_mapaddr;
xaddrs[i] = xiters[i].iter_mapaddr;
+ len = MIN(citers[i].iter_mapsize, len);
+ len = MIN(xiters[i].iter_mapsize, len);
}
- len = tsize;
- switch (parity) {
- case 3:
- len = MIN(xiters[2].iter_mapsize, len);
- len = MIN(citers[2].iter_mapsize, len);
- fallthrough;
- case 2:
- len = MIN(xiters[1].iter_mapsize, len);
- len = MIN(citers[1].iter_mapsize, len);
- fallthrough;
- case 1:
- len = MIN(xiters[0].iter_mapsize, len);
- len = MIN(citers[0].iter_mapsize, len);
- }
/* must be progressive */
ASSERT3S(len, >, 0);
/*
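[Note on the abd.c hunks above] The RAIDZ iterator changes drop the parity-indexed switch/fallthrough and instead fold MIN() over the mapped segment sizes inside the existing per-column loop. A minimal standalone C sketch of that folding step (a hypothetical helper for illustration, not part of the patch):

    #include <stddef.h>

    /*
     * Shrink the remaining column size to the smallest mapped segment,
     * mirroring the "len = MIN(caiters[i].iter_mapsize, len)" lines above.
     */
    static size_t
    fold_min_mapsize(const size_t *mapsize, unsigned parity, size_t len)
    {
            for (unsigned i = 0; i < parity; i++) {
                    if (mapsize[i] < len)
                            len = mapsize[i];
            }
            return (len);
    }

With parity = 3 and mapped sizes {4096, 1536, 8192}, the result is 1536, the same value the removed switch computed case by case; every column iterator can then advance by exactly that length.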
diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c
index c4ea4f86fc5f..488c6ef3b6fc 100644
--- a/sys/contrib/openzfs/module/zfs/aggsum.c
+++ b/sys/contrib/openzfs/module/zfs/aggsum.c
@@ -87,7 +87,7 @@ static uint_t aggsum_borrow_shift = 4;
void
aggsum_init(aggsum_t *as, uint64_t value)
{
- bzero(as, sizeof (*as));
+ memset(as, 0, sizeof (*as));
as->as_lower_bound = as->as_upper_bound = value;
mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
/*
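[Note on the aggsum.c hunk above] This is one instance of the tree-wide switch from the legacy bzero()/bcopy() routines to the standard memset()/memcpy(), which recurs throughout the arc.c diff below. A self-contained sketch of the substitution (the struct here is illustrative only, not the real aggsum_t layout):

    #include <string.h>

    struct example_sum { long lower_bound, upper_bound; };

    static void
    example_init(struct example_sum *s)
    {
            /* memset(p, 0, n) replaces the deprecated bzero(p, n). */
            memset(s, 0, sizeof (*s));
    }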
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index 79e2d4381830..30d30b98a6c6 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -108,12 +108,11 @@
* the active state mutex must be held before the ghost state mutex.
*
* It as also possible to register a callback which is run when the
- * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * metadata limit is reached and no buffers can be safely evicted. In
* this case the arc user should drop a reference on some arc buffers so
- * they can be reclaimed and the arc_meta_limit honored. For example,
- * when using the ZPL each dentry holds a references on a znode. These
- * dentries must be pruned before the arc buffer holding the znode can
- * be safely evicted.
+ * they can be reclaimed. For example, when using the ZPL each dentry
+ * holds a references on a znode. These dentries must be pruned before
+ * the arc buffer holding the znode can be safely evicted.
*
* Note that the majority of the performance stats are manipulated
* with atomic operations.
@@ -250,7 +249,7 @@
* since the physical block is about to be rewritten. The new data contents
* will be contained in the arc_buf_t. As the I/O pipeline performs the write,
* it may compress the data before writing it to disk. The ARC will be called
- * with the transformed data and will bcopy the transformed on-disk block into
+ * with the transformed data and will memcpy the transformed on-disk block into
* a newly allocated b_pabd. Writes are always done into buffers which have
* either been loaned (and hence are new and don't have other readers) or
* buffers which have been released (and hence have their own hdr, if there
@@ -328,9 +327,12 @@ static zthr_t *arc_reap_zthr;
* arc_evict(), which improves arc_is_overflowing().
*/
static zthr_t *arc_evict_zthr;
+static arc_buf_hdr_t **arc_state_evict_markers;
+static int arc_state_evict_marker_count;
static kmutex_t arc_evict_lock;
static boolean_t arc_evict_needed = B_FALSE;
+static clock_t arc_last_uncached_flush;
/*
* Count of bytes evicted since boot.
@@ -352,7 +354,7 @@ static list_t arc_evict_waiters;
* can still happen, even during the potentially long time that arc_size is
* more than arc_c.
*/
-int zfs_arc_eviction_pct = 200;
+static uint_t zfs_arc_eviction_pct = 200;
/*
* The number of headers to evict in arc_evict_state_impl() before
@@ -361,24 +363,21 @@ int zfs_arc_eviction_pct = 200;
* oldest header in the arc state), but comes with higher overhead
* (i.e. more invocations of arc_evict_state_impl()).
*/
-int zfs_arc_evict_batch_limit = 10;
+static uint_t zfs_arc_evict_batch_limit = 10;
/* number of seconds before growing cache again */
-int arc_grow_retry = 5;
+uint_t arc_grow_retry = 5;
/*
* Minimum time between calls to arc_kmem_reap_soon().
*/
-int arc_kmem_cache_reap_retry_ms = 1000;
+static const int arc_kmem_cache_reap_retry_ms = 1000;
/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
-int zfs_arc_overflow_shift = 8;
-
-/* shift of arc_c for calculating both min and max arc_p */
-int arc_p_min_shift = 4;
+static int zfs_arc_overflow_shift = 8;
/* log2(fraction of arc to reclaim) */
-int arc_shrink_shift = 7;
+uint_t arc_shrink_shift = 7;
/* percent of pagecache to reclaim arc to */
#ifdef _KERNEL
@@ -394,20 +393,20 @@ uint_t zfs_arc_pc_percent = 0;
* This must be less than arc_shrink_shift, so that when we shrink the ARC,
* we will still not allow it to grow.
*/
-int arc_no_grow_shift = 5;
+uint_t arc_no_grow_shift = 5;
/*
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
-static int arc_min_prefetch_ms;
-static int arc_min_prescient_prefetch_ms;
+static uint_t arc_min_prefetch_ms;
+static uint_t arc_min_prescient_prefetch_ms;
/*
* If this percent of memory is free, don't throttle.
*/
-int arc_lotsfree_percent = 10;
+uint_t arc_lotsfree_percent = 10;
/*
* The arc has filled available memory and has now warmed up.
@@ -417,23 +416,23 @@ boolean_t arc_warm;
/*
* These tunables are for performance analysis.
*/
-unsigned long zfs_arc_max = 0;
-unsigned long zfs_arc_min = 0;
-unsigned long zfs_arc_meta_limit = 0;
-unsigned long zfs_arc_meta_min = 0;
-unsigned long zfs_arc_dnode_limit = 0;
-unsigned long zfs_arc_dnode_reduce_percent = 10;
-int zfs_arc_grow_retry = 0;
-int zfs_arc_shrink_shift = 0;
-int zfs_arc_p_min_shift = 0;
-int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+uint64_t zfs_arc_max = 0;
+uint64_t zfs_arc_min = 0;
+static uint64_t zfs_arc_dnode_limit = 0;
+static uint_t zfs_arc_dnode_reduce_percent = 10;
+static uint_t zfs_arc_grow_retry = 0;
+static uint_t zfs_arc_shrink_shift = 0;
+uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
/*
- * ARC dirty data constraints for arc_tempreserve_space() throttle.
+ * ARC dirty data constraints for arc_tempreserve_space() throttle:
+ * * total dirty data limit
+ * * anon block dirty limit
+ * * each pool's anon allowance
*/
-unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
-unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
-unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
+static const unsigned long zfs_arc_dirty_limit_percent = 50;
+static const unsigned long zfs_arc_anon_limit_percent = 25;
+static const unsigned long zfs_arc_pool_dirty_percent = 20;
/*
* Enable or disable compressed arc buffers.
@@ -441,51 +440,60 @@ unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
int zfs_compressed_arc_enabled = B_TRUE;
/*
- * ARC will evict meta buffers that exceed arc_meta_limit. This
- * tunable make arc_meta_limit adjustable for different workloads.
+ * Balance between metadata and data on ghost hits. Values above 100
+ * increase metadata caching by proportionally reducing effect of ghost
+ * data hits on target data/metadata rate.
*/
-unsigned long zfs_arc_meta_limit_percent = 75;
+static uint_t zfs_arc_meta_balance = 500;
/*
* Percentage that can be consumed by dnodes of ARC meta buffers.
*/
-unsigned long zfs_arc_dnode_limit_percent = 10;
+static uint_t zfs_arc_dnode_limit_percent = 10;
+
+/*
+ * These tunables are Linux-specific
+ */
+static uint64_t zfs_arc_sys_free = 0;
+static uint_t zfs_arc_min_prefetch_ms = 0;
+static uint_t zfs_arc_min_prescient_prefetch_ms = 0;
+static uint_t zfs_arc_lotsfree_percent = 10;
/*
- * These tunables are Linux specific
+ * Number of arc_prune threads
*/
-unsigned long zfs_arc_sys_free = 0;
-int zfs_arc_min_prefetch_ms = 0;
-int zfs_arc_min_prescient_prefetch_ms = 0;
-int zfs_arc_p_dampener_disable = 1;
-int zfs_arc_meta_prune = 10000;
-int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
-int zfs_arc_meta_adjust_restarts = 4096;
-int zfs_arc_lotsfree_percent = 10;
+static int zfs_arc_prune_task_threads = 1;
-/* The 6 states: */
+/* The 7 states: */
arc_state_t ARC_anon;
arc_state_t ARC_mru;
arc_state_t ARC_mru_ghost;
arc_state_t ARC_mfu;
arc_state_t ARC_mfu_ghost;
arc_state_t ARC_l2c_only;
+arc_state_t ARC_uncached;
arc_stats_t arc_stats = {
{ "hits", KSTAT_DATA_UINT64 },
+ { "iohits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_iohits", KSTAT_DATA_UINT64 },
{ "demand_data_misses", KSTAT_DATA_UINT64 },
{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_iohits", KSTAT_DATA_UINT64 },
{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_iohits", KSTAT_DATA_UINT64 },
{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 },
{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
{ "mru_hits", KSTAT_DATA_UINT64 },
{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
{ "mfu_hits", KSTAT_DATA_UINT64 },
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "uncached_hits", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "access_skip", KSTAT_DATA_UINT64 },
@@ -502,7 +510,9 @@ arc_stats_t arc_stats = {
{ "hash_collisions", KSTAT_DATA_UINT64 },
{ "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 },
- { "p", KSTAT_DATA_UINT64 },
+ { "meta", KSTAT_DATA_UINT64 },
+ { "pd", KSTAT_DATA_UINT64 },
+ { "pm", KSTAT_DATA_UINT64 },
{ "c", KSTAT_DATA_UINT64 },
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
@@ -520,20 +530,35 @@ arc_stats_t arc_stats = {
{ "other_size", KSTAT_DATA_UINT64 },
#endif
{ "anon_size", KSTAT_DATA_UINT64 },
+ { "anon_data", KSTAT_DATA_UINT64 },
+ { "anon_metadata", KSTAT_DATA_UINT64 },
{ "anon_evictable_data", KSTAT_DATA_UINT64 },
{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mru_size", KSTAT_DATA_UINT64 },
+ { "mru_data", KSTAT_DATA_UINT64 },
+ { "mru_metadata", KSTAT_DATA_UINT64 },
{ "mru_evictable_data", KSTAT_DATA_UINT64 },
{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mru_ghost_size", KSTAT_DATA_UINT64 },
+ { "mru_ghost_data", KSTAT_DATA_UINT64 },
+ { "mru_ghost_metadata", KSTAT_DATA_UINT64 },
{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mfu_size", KSTAT_DATA_UINT64 },
+ { "mfu_data", KSTAT_DATA_UINT64 },
+ { "mfu_metadata", KSTAT_DATA_UINT64 },
{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_data", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_metadata", KSTAT_DATA_UINT64 },
{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "uncached_size", KSTAT_DATA_UINT64 },
+ { "uncached_data", KSTAT_DATA_UINT64 },
+ { "uncached_metadata", KSTAT_DATA_UINT64 },
+ { "uncached_evictable_data", KSTAT_DATA_UINT64 },
+ { "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_prefetch_asize", KSTAT_DATA_UINT64 },
@@ -586,13 +611,14 @@ arc_stats_t arc_stats = {
{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
{ "arc_prune", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
- { "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_dnode_limit", KSTAT_DATA_UINT64 },
- { "arc_meta_max", KSTAT_DATA_UINT64 },
- { "arc_meta_min", KSTAT_DATA_UINT64 },
{ "async_upgrade_sync", KSTAT_DATA_UINT64 },
+ { "predictive_prefetch", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "prescient_prefetch", KSTAT_DATA_UINT64 },
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 },
{ "arc_need_free", KSTAT_DATA_UINT64 },
{ "arc_sys_free", KSTAT_DATA_UINT64 },
{ "arc_raw_size", KSTAT_DATA_UINT64 },
@@ -646,7 +672,7 @@ arc_sums_t arc_sums;
ARCSTAT(stat) = x; \
} while (0)
-kstat_t *arc_ksp;
+static kstat_t *arc_ksp;
/*
* There are several ARC variables that are critical to export as kstats --
@@ -658,10 +684,7 @@ kstat_t *arc_ksp;
*/
#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
-#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
-/* max size for dnodes */
-#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit)
-#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */
hrtime_t arc_growtime;
@@ -683,6 +706,7 @@ taskq_t *arc_prune_taskq;
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED)
#define HDR_L2_READING(hdr) \
(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
@@ -724,8 +748,7 @@ taskq_t *arc_prune_taskq;
* Other sizes
*/
-#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
+#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
@@ -753,8 +776,8 @@ uint64_t zfs_crc64_table[256];
* Level 2 ARC
*/
-#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 2 /* num of writes */
+#define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 8 /* num of writes */
/*
* If we discover during ARC scan any buffers to be compressed, we boost
@@ -771,16 +794,16 @@ uint64_t zfs_crc64_table[256];
#define L2ARC_FEED_TYPES 4
/* L2ARC Performance Tunables */
-unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
-unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
-unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
-unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
-unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
+uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
+uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
int l2arc_feed_again = B_TRUE; /* turbo warmup */
int l2arc_norw = B_FALSE; /* no reads during writes */
-int l2arc_meta_percent = 33; /* limit on headers size */
+static uint_t l2arc_meta_percent = 33; /* limit on headers size */
/*
* L2ARC Internals
@@ -833,21 +856,24 @@ static kcondvar_t l2arc_rebuild_thr_cv;
enum arc_hdr_alloc_flags {
ARC_HDR_ALLOC_RDATA = 0x1,
- ARC_HDR_DO_ADAPT = 0x2,
ARC_HDR_USE_RESERVE = 0x4,
+ ARC_HDR_ALLOC_LINEAR = 0x8,
};
-static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int);
-static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
-static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int);
-static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
-static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
-static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int);
+static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *);
+static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size,
+ const void *tag);
static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
-static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static void arc_hdr_destroy(arc_buf_hdr_t *);
+static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t);
static void arc_buf_watch(arc_buf_t *);
+static void arc_change_state(arc_state_t *, arc_buf_hdr_t *);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -860,6 +886,8 @@ static void l2arc_do_free_on_write(void);
static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
boolean_t state_only);
+static void arc_prune_async(uint64_t adjust);
+
#define l2arc_hdr_arcstats_increment(hdr) \
l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
#define l2arc_hdr_arcstats_decrement(hdr) \
@@ -881,7 +909,7 @@ int l2arc_exclude_special = 0;
* l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
* metadata and data are cached from ARC into L2ARC.
*/
-int l2arc_mfuonly = 0;
+static int l2arc_mfuonly = 0;
/*
* L2ARC TRIM
@@ -898,7 +926,7 @@ int l2arc_mfuonly = 0;
* will vary depending of how well the specific device handles
* these commands.
*/
-unsigned long l2arc_trim_ahead = 0;
+static uint64_t l2arc_trim_ahead = 0;
/*
* Performance tuning of L2ARC persistence:
@@ -913,12 +941,12 @@ unsigned long l2arc_trim_ahead = 0;
* data. In this case do not write log blocks in L2ARC in order
* not to waste space.
*/
-int l2arc_rebuild_enabled = B_TRUE;
-unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
+static int l2arc_rebuild_enabled = B_TRUE;
+static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
/* L2ARC persistence rebuild control routines. */
void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
-static void l2arc_dev_rebuild_thread(void *arg);
+static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg);
static int l2arc_rebuild(l2arc_dev_t *dev);
/* L2ARC persistence read I/O routines. */
@@ -938,7 +966,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
/* L2ARC persistence write I/O routines. */
-static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
l2arc_write_callback_t *cb);
/* L2ARC persistence auxiliary routines. */
@@ -986,7 +1014,7 @@ static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
const dva_t *dva = BP_IDENTITY(bp);
- uint64_t birth = BP_PHYSICAL_BIRTH(bp);
+ uint64_t birth = BP_GET_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *hdr;
@@ -1086,15 +1114,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
*/
static kmem_cache_t *hdr_full_cache;
-static kmem_cache_t *hdr_full_crypt_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
static void
buf_fini(void)
{
- int i;
-
#if defined(_KERNEL)
/*
* Large allocations which do not require contiguous pages
@@ -1106,10 +1131,9 @@ buf_fini(void)
kmem_free(buf_hash_table.ht_table,
(buf_hash_table.ht_mask + 1) * sizeof (void *));
#endif
- for (i = 0; i < BUF_LOCKS; i++)
+ for (int i = 0; i < BUF_LOCKS; i++)
mutex_destroy(BUF_HASH_LOCK(i));
kmem_cache_destroy(hdr_full_cache);
- kmem_cache_destroy(hdr_full_crypt_cache);
kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@@ -1118,58 +1142,44 @@ buf_fini(void)
* Constructor callback - called when the cache is empty
* and a new buf is requested.
*/
-/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
arc_buf_hdr_t *hdr = vbuf;
- bzero(hdr, HDR_FULL_SIZE);
+ memset(hdr, 0, HDR_FULL_SIZE);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
+#ifdef ZFS_DEBUG
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- list_link_init(&hdr->b_l1hdr.b_arc_node);
- list_link_init(&hdr->b_l2hdr.b_l2node);
+#endif
multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+ list_link_init(&hdr->b_l2hdr.b_l2node);
arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
return (0);
}
-/* ARGSUSED */
-static int
-hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- hdr_full_cons(vbuf, unused, kmflag);
- bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
- arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
-
- return (0);
-}
-
-/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
arc_buf_hdr_t *hdr = vbuf;
- bzero(hdr, HDR_L2ONLY_SIZE);
+ memset(hdr, 0, HDR_L2ONLY_SIZE);
arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
return (0);
}
-/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
arc_buf_t *buf = vbuf;
- bzero(buf, sizeof (arc_buf_t));
- mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ memset(buf, 0, sizeof (arc_buf_t));
arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
return (0);
@@ -1179,47 +1189,37 @@ buf_cons(void *vbuf, void *unused, int kmflag)
* Destructor callback - called when a cached buf is
* no longer required.
*/
-/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
+ (void) unused;
arc_buf_hdr_t *hdr = vbuf;
ASSERT(HDR_EMPTY(hdr));
- cv_destroy(&hdr->b_l1hdr.b_cv);
zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+#ifdef ZFS_DEBUG
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+#endif
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
-/* ARGSUSED */
-static void
-hdr_full_crypt_dest(void *vbuf, void *unused)
-{
- arc_buf_hdr_t *hdr = vbuf;
-
- hdr_full_dest(vbuf, unused);
- arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
-}
-
-/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
- arc_buf_hdr_t *hdr __maybe_unused = vbuf;
+ (void) unused;
+ arc_buf_hdr_t *hdr = vbuf;
ASSERT(HDR_EMPTY(hdr));
arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}
-/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
- arc_buf_t *buf = vbuf;
+ (void) unused;
+ (void) vbuf;
- mutex_destroy(&buf->b_evict_lock);
arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
@@ -1259,9 +1259,6 @@ retry:
hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
- hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
- HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
- NULL, NULL, NULL, 0);
hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
NULL, NULL, 0);
@@ -1324,9 +1321,9 @@ arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
ASSERT(HDR_PROTECTED(hdr));
- bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
- bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
- bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
+ memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
*byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
}
@@ -1369,7 +1366,7 @@ arc_buf_is_shared(arc_buf_t *buf)
abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
- IMPLY(shared, ARC_BUF_SHARED(buf));
+ EQUIV(shared, ARC_BUF_SHARED(buf));
IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
/*
@@ -1387,6 +1384,7 @@ arc_buf_is_shared(arc_buf_t *buf)
static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
+#ifdef ZFS_DEBUG
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
@@ -1395,6 +1393,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
hdr->b_l1hdr.b_freeze_cksum = NULL;
}
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+#endif
}
/*
@@ -1423,6 +1422,7 @@ arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
static void
arc_cksum_verify(arc_buf_t *buf)
{
+#ifdef ZFS_DEBUG
arc_buf_hdr_t *hdr = buf->b_hdr;
zio_cksum_t zc;
@@ -1445,6 +1445,7 @@ arc_cksum_verify(arc_buf_t *buf)
if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+#endif
}
/*
@@ -1485,14 +1486,13 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
static void
arc_cksum_compute(arc_buf_t *buf)
{
- arc_buf_hdr_t *hdr = buf->b_hdr;
-
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+#ifdef ZFS_DEBUG
+ arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(HDR_HAS_L1HDR(hdr));
-
- mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
@@ -1505,6 +1505,7 @@ arc_cksum_compute(arc_buf_t *buf)
fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
hdr->b_l1hdr.b_freeze_cksum);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+#endif
arc_buf_watch(buf);
}
@@ -1512,11 +1513,11 @@ arc_cksum_compute(arc_buf_t *buf)
void
arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
{
+ (void) sig, (void) unused;
panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
}
#endif
-/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
@@ -1525,10 +1526,11 @@ arc_buf_unwatch(arc_buf_t *buf)
ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
PROT_READ | PROT_WRITE));
}
+#else
+ (void) buf;
#endif
}
-/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
@@ -1536,6 +1538,8 @@ arc_buf_watch(arc_buf_t *buf)
if (arc_watch)
ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
PROT_READ));
+#else
+ (void) buf;
#endif
}
@@ -1681,18 +1685,20 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
}
if (!ARC_BUF_COMPRESSED(from)) {
- bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ memcpy(buf->b_data, from->b_data, arc_buf_size(buf));
copied = B_TRUE;
break;
}
}
+#ifdef ZFS_DEBUG
/*
* There were no decompressed bufs, so there should not be a
* checksum on the hdr either.
*/
if (zfs_flags & ZFS_DEBUG_MODIFY)
EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+#endif
return (copied);
}
@@ -1778,12 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!HDR_COMPRESSION_ENABLED(hdr)) {
- tmpbuf = zio_buf_alloc(lsize);
- abd = abd_get_from_buf(tmpbuf, lsize);
- abd_take_ownership_of_buf(abd, B_TRUE);
+
csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel);
+ hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel);
+ ASSERT3P(tmpbuf, !=, NULL);
ASSERT3U(csize, <=, psize);
+ abd = abd_get_from_buf(tmpbuf, lsize);
+ abd_take_ownership_of_buf(abd, B_TRUE);
abd_zero_off(abd, csize, psize - csize);
}
@@ -1836,7 +1843,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_ENCRYPTED(hdr));
- arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ arc_hdr_alloc_abd(hdr, 0);
ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
@@ -1863,8 +1870,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
* and then loan a buffer from it, rather than allocating a
* linear buffer and wrapping it in an abd later.
*/
- cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
- ARC_HDR_DO_ADAPT);
+ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
@@ -1947,20 +1953,19 @@ error:
* arc_buf_fill().
*/
static void
-arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
+arc_buf_untransform_in_place(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(HDR_ENCRYPTED(hdr));
ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf);
zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
- hdr->b_crypt_hdr.b_ebufcnt -= 1;
}
/*
@@ -1995,7 +2000,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
IMPLY(encrypted, HDR_ENCRYPTED(hdr));
IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
- IMPLY(encrypted, !ARC_BUF_SHARED(buf));
+ IMPLY(encrypted, !arc_buf_is_shared(buf));
/*
* If the caller wanted encrypted data we just need to copy it from
@@ -2051,7 +2056,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
if (hash_lock != NULL)
mutex_enter(hash_lock);
- arc_buf_untransform_in_place(buf, hash_lock);
+ arc_buf_untransform_in_place(buf);
if (hash_lock != NULL)
mutex_exit(hash_lock);
@@ -2063,21 +2068,23 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
}
if (hdr_compressed == compressed) {
- if (!arc_buf_is_shared(buf)) {
+ if (ARC_BUF_SHARED(buf)) {
+ ASSERT(arc_buf_is_shared(buf));
+ } else {
abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
arc_buf_size(buf));
}
} else {
ASSERT(hdr_compressed);
ASSERT(!compressed);
- ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
/*
* If the buf is sharing its data with the hdr, unlink it and
* allocate a new data buffer for the buf.
*/
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_COMPRESSED(buf));
+ if (ARC_BUF_SHARED(buf)) {
+ ASSERTF(ARC_BUF_COMPRESSED(buf),
+ "buf %p was uncompressed", buf);
/* We need to give the buf its own b_data */
buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
@@ -2088,6 +2095,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
/* Previously overhead was 0; just add new overhead */
ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
} else if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(!arc_buf_is_shared(buf));
+
/* We need to reallocate the buf's b_data */
arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
buf);
@@ -2175,7 +2184,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
* (and generate an ereport) before leaving the ARC.
*/
ret = SET_ERROR(EIO);
- spa_log_error(spa, zb);
+ spa_log_error(spa, zb, buf->b_hdr->b_birth);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0);
}
@@ -2196,7 +2205,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@@ -2216,7 +2224,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf))
+ if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_add_many(&state->arcs_esize[type],
arc_buf_size(buf), buf);
@@ -2236,7 +2244,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@@ -2256,7 +2263,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf))
+ if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_remove_many(&state->arcs_esize[type],
arc_buf_size(buf), buf);
@@ -2270,33 +2277,22 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
* it is not evictable.
*/
static void
-add_reference(arc_buf_hdr_t *hdr, void *tag)
+add_reference(arc_buf_hdr_t *hdr, const void *tag)
{
- arc_state_t *state;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(HDR_HAS_L1HDR(hdr));
if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
- ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(state == arc_anon);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
}
- state = hdr->b_l1hdr.b_state;
-
if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
- (state != arc_anon)) {
+ state != arc_anon && state != arc_l2c_only) {
/* We don't use the L2-only state list. */
- if (state != arc_l2c_only) {
- multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
- hdr);
- arc_evictable_space_decrement(hdr, state);
- }
- /* remove the prefetch flag if we get a reference */
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_decrement_state(hdr);
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_increment_state(hdr);
+ multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr);
+ arc_evictable_space_decrement(hdr, state);
}
}
@@ -2306,26 +2302,30 @@ add_reference(arc_buf_hdr_t *hdr, void *tag)
* list making it eligible for eviction.
*/
static int
-remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
+remove_reference(arc_buf_hdr_t *hdr, const void *tag)
{
int cnt;
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
- ASSERT(!GHOST_STATE(state));
+ ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr)));
+ ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */
- /*
- * arc_l2c_only counts as a ghost state so we don't need to explicitly
- * check to prevent usage of the arc_l2c_only list.
- */
- if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
- (state != arc_anon)) {
- multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
- arc_evictable_space_increment(hdr, state);
+ if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0)
+ return (cnt);
+
+ if (state == arc_anon) {
+ arc_hdr_destroy(hdr);
+ return (0);
+ }
+ if (state == arc_uncached && !HDR_PREFETCH(hdr)) {
+ arc_change_state(arc_anon, hdr);
+ arc_hdr_destroy(hdr);
+ return (0);
}
- return (cnt);
+ multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
+ arc_evictable_space_increment(hdr, state);
+ return (0);
}
/*
@@ -2338,6 +2338,7 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
void
arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
{
+ (void) state_index;
arc_buf_hdr_t *hdr = ab->b_hdr;
l1arc_buf_hdr_t *l1hdr = NULL;
l2arc_buf_hdr_t *l2hdr = NULL;
@@ -2358,7 +2359,9 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
l2hdr = &hdr->b_l2hdr;
if (l1hdr) {
- abi->abi_bufcnt = l1hdr->b_bufcnt;
+ abi->abi_bufcnt = 0;
+ for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next)
+ abi->abi_bufcnt++;
abi->abi_access = l1hdr->b_arc_access;
abi->abi_mru_hits = l1hdr->b_mru_hits;
abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
@@ -2382,14 +2385,12 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
* for the buffer must be held by the caller.
*/
static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
- kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
{
arc_state_t *old_state;
int64_t refcnt;
- uint32_t bufcnt;
boolean_t update_old, update_new;
- arc_buf_contents_t buftype = arc_buf_type(hdr);
+ arc_buf_contents_t type = arc_buf_type(hdr);
/*
* We almost always have an L1 hdr here, since we call arc_hdr_realloc()
@@ -2401,21 +2402,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (HDR_HAS_L1HDR(hdr)) {
old_state = hdr->b_l1hdr.b_state;
refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
- bufcnt = hdr->b_l1hdr.b_bufcnt;
- update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
- HDR_HAS_RABD(hdr));
+ update_old = (hdr->b_l1hdr.b_buf != NULL ||
+ hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+
+ IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
+ IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
+ IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
+ ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
} else {
old_state = arc_l2c_only;
refcnt = 0;
- bufcnt = 0;
update_old = B_FALSE;
}
update_new = update_old;
+ if (GHOST_STATE(old_state))
+ update_old = B_TRUE;
+ if (GHOST_STATE(new_state))
+ update_new = B_TRUE;
- ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
ASSERT3P(new_state, !=, old_state);
- ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
- ASSERT(old_state != arc_anon || bufcnt <= 1);
/*
* If this buffer is evictable, transfer it from the
@@ -2424,14 +2430,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (refcnt == 0) {
if (old_state != arc_anon && old_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
- multilist_remove(&old_state->arcs_list[buftype], hdr);
-
- if (GHOST_STATE(old_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- update_old = B_TRUE;
+ /* remove_reference() saves on insert. */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ multilist_remove(&old_state->arcs_list[type],
+ hdr);
+ arc_evictable_space_decrement(hdr, old_state);
}
- arc_evictable_space_decrement(hdr, old_state);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
/*
@@ -2441,13 +2445,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* beforehand.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
- multilist_insert(&new_state->arcs_list[buftype], hdr);
-
- if (GHOST_STATE(new_state)) {
- ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- update_new = B_TRUE;
- }
+ multilist_insert(&new_state->arcs_list[type], hdr);
arc_evictable_space_increment(hdr, new_state);
}
}
@@ -2461,21 +2459,19 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (update_new && new_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(new_state)) {
- ASSERT0(bufcnt);
/*
* When moving a header to a ghost state, we first
- * remove all arc buffers. Thus, we'll have a
- * bufcnt of zero, and no arc buffer to use for
- * the reference. As a result, we use the arc
- * header pointer for the reference.
+ * remove all arc buffers. Thus, we'll have no arc
+ * buffer to use for the reference. As a result, we
+ * use the arc header pointer for the reference.
*/
- (void) zfs_refcount_add_many(&new_state->arcs_size,
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size[type],
HDR_GET_LSIZE(hdr), hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
} else {
- uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
@@ -2484,8 +2480,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- ASSERT3U(bufcnt, !=, 0);
- buffers++;
/*
* When the arc_buf_t is sharing the data
@@ -2494,24 +2488,23 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf))
+ if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_add_many(
- &new_state->arcs_size,
+ &new_state->arcs_size[type],
arc_buf_size(buf), buf);
}
- ASSERT3U(bufcnt, ==, buffers);
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) zfs_refcount_add_many(
- &new_state->arcs_size,
+ &new_state->arcs_size[type],
arc_hdr_size(hdr), hdr);
}
if (HDR_HAS_RABD(hdr)) {
(void) zfs_refcount_add_many(
- &new_state->arcs_size,
+ &new_state->arcs_size[type],
HDR_GET_PSIZE(hdr), hdr);
}
}
@@ -2520,7 +2513,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
if (update_old && old_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
- ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@@ -2532,10 +2524,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* header on the ghost state.
*/
- (void) zfs_refcount_remove_many(&old_state->arcs_size,
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size[type],
HDR_GET_LSIZE(hdr), hdr);
} else {
- uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
@@ -2544,8 +2536,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- ASSERT3U(bufcnt, !=, 0);
- buffers++;
/*
* When the arc_buf_t is sharing the data
@@ -2554,27 +2544,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf))
+ if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_remove_many(
- &old_state->arcs_size, arc_buf_size(buf),
- buf);
+ &old_state->arcs_size[type],
+ arc_buf_size(buf), buf);
}
- ASSERT3U(bufcnt, ==, buffers);
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
HDR_HAS_RABD(hdr));
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) zfs_refcount_remove_many(
- &old_state->arcs_size, arc_hdr_size(hdr),
- hdr);
+ &old_state->arcs_size[type],
+ arc_hdr_size(hdr), hdr);
}
if (HDR_HAS_RABD(hdr)) {
(void) zfs_refcount_remove_many(
- &old_state->arcs_size, HDR_GET_PSIZE(hdr),
- hdr);
+ &old_state->arcs_size[type],
+ HDR_GET_PSIZE(hdr), hdr);
}
}
}
@@ -2608,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
ARCSTAT_INCR(arcstat_bonus_size, space);
break;
case ARC_SPACE_DNODE:
- aggsum_add(&arc_sums.arcstat_dnode_size, space);
+ ARCSTAT_INCR(arcstat_dnode_size, space);
break;
case ARC_SPACE_DBUF:
ARCSTAT_INCR(arcstat_dbuf_size, space);
@@ -2631,7 +2620,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
}
if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
- aggsum_add(&arc_sums.arcstat_meta_used, space);
+ ARCSTAT_INCR(arcstat_meta_used, space);
aggsum_add(&arc_sums.arcstat_size, space);
}
@@ -2654,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
ARCSTAT_INCR(arcstat_bonus_size, -space);
break;
case ARC_SPACE_DNODE:
- aggsum_add(&arc_sums.arcstat_dnode_size, -space);
+ ARCSTAT_INCR(arcstat_dnode_size, -space);
break;
case ARC_SPACE_DBUF:
ARCSTAT_INCR(arcstat_dbuf_size, -space);
@@ -2670,13 +2659,8 @@ arc_space_return(uint64_t space, arc_space_type_t type)
break;
}
- if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) {
- ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used,
- space) >= 0);
- ARCSTAT_MAX(arcstat_meta_max,
- aggsum_upper_bound(&arc_sums.arcstat_meta_used));
- aggsum_add(&arc_sums.arcstat_meta_used, -space);
- }
+ if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
+ ARCSTAT_INCR(arcstat_meta_used, -space);
ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
aggsum_add(&arc_sums.arcstat_size, -space);
@@ -2729,8 +2713,8 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
- void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
- boolean_t fill, arc_buf_t **ret)
+ const void *tag, boolean_t encrypted, boolean_t compressed,
+ boolean_t noauth, boolean_t fill, arc_buf_t **ret)
{
arc_buf_t *buf;
arc_fill_flags_t flags = ARC_FILL_LOCKED;
@@ -2814,9 +2798,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
VERIFY3P(buf->b_data, !=, NULL);
hdr->b_l1hdr.b_buf = buf;
- hdr->b_l1hdr.b_bufcnt += 1;
- if (encrypted)
- hdr->b_crypt_hdr.b_ebufcnt += 1;
/*
* If the user wants the data from the hdr, we need to either copy or
@@ -2830,7 +2811,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
return (0);
}
-static char *arc_onloan_tag = "onloan";
+static const char *arc_onloan_tag = "onloan";
static inline void
arc_loaned_bytes_update(int64_t delta)
@@ -2889,7 +2870,7 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
* Return a loaned arc buffer to the arc.
*/
void
-arc_return_buf(arc_buf_t *buf, void *tag)
+arc_return_buf(arc_buf_t *buf, const void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -2903,7 +2884,7 @@ arc_return_buf(arc_buf_t *buf, void *tag)
/* Detach an arc_buf from a dbuf (tag) */
void
-arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+arc_loan_inuse_buf(arc_buf_t *buf, const void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -2943,7 +2924,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
(void) zfs_refcount_remove_many(&state->arcs_esize[type],
size, hdr);
}
- (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
+ (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr);
if (type == ARC_BUFC_METADATA) {
arc_space_return(size, ARC_SPACE_META);
} else {
@@ -2976,7 +2957,8 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* refcount ownership to the hdr since it always owns
* the refcount whenever an arc_buf_t is shared.
*/
- zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
+ zfs_refcount_transfer_ownership_many(
+ &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
arc_hdr_size(hdr), buf, hdr);
hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
@@ -3005,7 +2987,8 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* We are no longer sharing this buffer so we need
* to transfer its ownership to the rightful owner.
*/
- zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
+ zfs_refcount_transfer_ownership_many(
+ &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
arc_hdr_size(hdr), hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
@@ -3056,8 +3039,6 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
}
buf->b_next = NULL;
ASSERT3P(lastbuf, !=, buf);
- IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
- IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
return (lastbuf);
@@ -3087,31 +3068,30 @@ arc_buf_destroy_impl(arc_buf_t *buf)
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
- if (arc_buf_is_shared(buf)) {
+ if (ARC_BUF_SHARED(buf)) {
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
+ ASSERT(!arc_buf_is_shared(buf));
uint64_t size = arc_buf_size(buf);
arc_free_data_buf(hdr, buf->b_data, size, buf);
ARCSTAT_INCR(arcstat_overhead_size, -size);
}
buf->b_data = NULL;
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
- hdr->b_l1hdr.b_bufcnt -= 1;
-
- if (ARC_BUF_ENCRYPTED(buf)) {
- hdr->b_crypt_hdr.b_ebufcnt -= 1;
-
- /*
- * If we have no more encrypted buffers and we've
- * already gotten a copy of the decrypted data we can
- * free b_rabd to save some space.
- */
- if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
- HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
- !HDR_IO_IN_PROGRESS(hdr)) {
- arc_hdr_free_abd(hdr, B_TRUE);
+ /*
+ * If we have no more encrypted buffers and we've already
+ * gotten a copy of the decrypted data we can free b_rabd
+ * to save some space.
+ */
+ if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
+ hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_buf_t *b;
+ for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
+ if (b != buf && ARC_BUF_ENCRYPTED(b))
+ break;
}
+ if (b == NULL)
+ arc_hdr_free_abd(hdr, B_TRUE);
}
}
@@ -3132,9 +3112,9 @@ arc_buf_destroy_impl(arc_buf_t *buf)
*/
if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
/* Only one buf can be shared at once */
- VERIFY(!arc_buf_is_shared(lastbuf));
+ ASSERT(!arc_buf_is_shared(lastbuf));
/* hdr is uncompressed so can't have compressed buf */
- VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+ ASSERT(!ARC_BUF_COMPRESSED(lastbuf));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
arc_hdr_free_abd(hdr, B_FALSE);
@@ -3272,14 +3252,12 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
arc_buf_hdr_t *hdr;
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
- if (protected) {
- hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
- } else {
- hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
- }
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(HDR_EMPTY(hdr));
+#ifdef ZFS_DEBUG
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+#endif
HDR_SET_PSIZE(hdr, psize);
HDR_SET_LSIZE(hdr, lsize);
hdr->b_spa = spa;
@@ -3297,7 +3275,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
- hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_buf = NULL;
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -3323,24 +3300,14 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
(old == hdr_l2only_cache && new == hdr_full_cache));
- /*
- * if the caller wanted a new full header and the header is to be
- * encrypted we will actually allocate the header from the full crypt
- * cache instead. The same applies to freeing from the old cache.
- */
- if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
- new = hdr_full_crypt_cache;
- if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
- old = hdr_full_crypt_cache;
-
nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
buf_hash_remove(hdr);
- bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+ memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);
- if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
+ if (new == hdr_full_cache) {
arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
/*
* arc_access and arc_change_state need to be aware that a
@@ -3354,8 +3321,9 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
ASSERT(!HDR_HAS_RABD(hdr));
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
+#ifdef ZFS_DEBUG
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+#endif
/*
* If we've reached here, We must have been called from
@@ -3419,125 +3387,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
}
/*
- * This function allows an L1 header to be reallocated as a crypt
- * header and vice versa. If we are going to a crypt header, the
- * new fields will be zeroed out.
- */
-static arc_buf_hdr_t *
-arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
-{
- arc_buf_hdr_t *nhdr;
- arc_buf_t *buf;
- kmem_cache_t *ncache, *ocache;
-
- /*
- * This function requires that hdr is in the arc_anon state.
- * Therefore it won't have any L2ARC data for us to worry
- * about copying.
- */
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(!HDR_HAS_L2HDR(hdr));
- ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
- ASSERT3P(hdr->b_hash_next, ==, NULL);
-
- if (need_crypt) {
- ncache = hdr_full_crypt_cache;
- ocache = hdr_full_cache;
- } else {
- ncache = hdr_full_cache;
- ocache = hdr_full_crypt_cache;
- }
-
- nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
-
- /*
- * Copy all members that aren't locks or condvars to the new header.
- * No lists are pointing to us (as we asserted above), so we don't
- * need to worry about the list nodes.
- */
- nhdr->b_dva = hdr->b_dva;
- nhdr->b_birth = hdr->b_birth;
- nhdr->b_type = hdr->b_type;
- nhdr->b_flags = hdr->b_flags;
- nhdr->b_psize = hdr->b_psize;
- nhdr->b_lsize = hdr->b_lsize;
- nhdr->b_spa = hdr->b_spa;
- nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
- nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
- nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
- nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
- nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
- nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
- nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
- nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
- nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
- nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
- nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
-
- /*
- * This zfs_refcount_add() exists only to ensure that the individual
- * arc buffers always point to a header that is referenced, avoiding
- * a small race condition that could trigger ASSERTs.
- */
- (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
- nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
- for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
- mutex_enter(&buf->b_evict_lock);
- buf->b_hdr = nhdr;
- mutex_exit(&buf->b_evict_lock);
- }
-
- zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
- (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
- ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
-
- if (need_crypt) {
- arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
- } else {
- arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
- }
-
- /* unset all members of the original hdr */
- bzero(&hdr->b_dva, sizeof (dva_t));
- hdr->b_birth = 0;
- hdr->b_type = ARC_BUFC_INVALID;
- hdr->b_flags = 0;
- hdr->b_psize = 0;
- hdr->b_lsize = 0;
- hdr->b_spa = 0;
- hdr->b_l1hdr.b_freeze_cksum = NULL;
- hdr->b_l1hdr.b_buf = NULL;
- hdr->b_l1hdr.b_bufcnt = 0;
- hdr->b_l1hdr.b_byteswap = 0;
- hdr->b_l1hdr.b_state = NULL;
- hdr->b_l1hdr.b_arc_access = 0;
- hdr->b_l1hdr.b_mru_hits = 0;
- hdr->b_l1hdr.b_mru_ghost_hits = 0;
- hdr->b_l1hdr.b_mfu_hits = 0;
- hdr->b_l1hdr.b_mfu_ghost_hits = 0;
- hdr->b_l1hdr.b_acb = NULL;
- hdr->b_l1hdr.b_pabd = NULL;
-
- if (ocache == hdr_full_crypt_cache) {
- ASSERT(!HDR_HAS_RABD(hdr));
- hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
- hdr->b_crypt_hdr.b_ebufcnt = 0;
- hdr->b_crypt_hdr.b_dsobj = 0;
- bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
- bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
- bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
- }
-
- buf_discard_identity(hdr);
- kmem_cache_free(ocache, hdr);
-
- return (nhdr);
-}
-
-/*
* This function is used by the send / receive code to convert a newly
* allocated arc_buf_t to one that is suitable for a raw encrypted write. It
* is also used to allow the root objset block to be updated without altering
@@ -3556,8 +3405,7 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
- if (!HDR_PROTECTED(hdr))
- hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
hdr->b_crypt_hdr.b_dsobj = dsobj;
hdr->b_crypt_hdr.b_ot = ot;
hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
@@ -3566,11 +3414,11 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
arc_cksum_free(hdr);
if (salt != NULL)
- bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
if (iv != NULL)
- bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
if (mac != NULL)
- bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+ memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
}
/*
@@ -3578,7 +3426,8 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
* The buf is returned thawed since we expect the consumer to modify it.
*/
arc_buf_t *
-arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
+arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type,
+ int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
B_FALSE, ZIO_COMPRESS_OFF, 0, type);
@@ -3596,8 +3445,8 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
* for bufs containing metadata.
*/
arc_buf_t *
-arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
- enum zio_compress compression_type, uint8_t complevel)
+arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize,
+ uint64_t lsize, enum zio_compress compression_type, uint8_t complevel)
{
ASSERT3U(lsize, >, 0);
ASSERT3U(lsize, >=, psize);
@@ -3611,7 +3460,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
B_TRUE, B_FALSE, B_FALSE, &buf));
arc_buf_thaw(buf);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
/*
* To ensure that the hdr has the correct data in it if we call
@@ -3624,9 +3472,9 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
}
arc_buf_t *
-arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
- const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
- dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj,
+ boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
+ const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
enum zio_compress compression_type, uint8_t complevel)
{
arc_buf_hdr_t *hdr;
@@ -3646,9 +3494,9 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
hdr->b_crypt_hdr.b_ot = ot;
hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
- bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
- bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
- bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+ memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
+ memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
+ memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
/*
* This buffer will be considered encrypted even if the ot is not an
@@ -3659,7 +3507,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
B_FALSE, B_FALSE, &buf));
arc_buf_thaw(buf);
- ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
return (buf);
}
@@ -3759,8 +3606,6 @@ static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(hdr->b_l1hdr.b_buf == NULL ||
- hdr->b_l1hdr.b_bufcnt > 0);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
}
@@ -3821,27 +3666,25 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
if (HDR_HAS_L1HDR(hdr)) {
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
-
- if (!HDR_PROTECTED(hdr)) {
- kmem_cache_free(hdr_full_cache, hdr);
- } else {
- kmem_cache_free(hdr_full_crypt_cache, hdr);
- }
+#ifdef ZFS_DEBUG
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+#endif
+ kmem_cache_free(hdr_full_cache, hdr);
} else {
kmem_cache_free(hdr_l2only_cache, hdr);
}
}
void
-arc_buf_destroy(arc_buf_t *buf, void* tag)
+arc_buf_destroy(arc_buf_t *buf, const void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
if (hdr->b_l1hdr.b_state == arc_anon) {
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
+ ASSERT(ARC_BUF_LAST(buf));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- VERIFY0(remove_reference(hdr, NULL, tag));
- arc_hdr_destroy(hdr);
+ VERIFY0(remove_reference(hdr, tag));
return;
}
@@ -3849,13 +3692,13 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
mutex_enter(hash_lock);
ASSERT3P(hdr, ==, buf->b_hdr);
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
ASSERT3P(buf->b_data, !=, NULL);
- (void) remove_reference(hdr, hash_lock, tag);
arc_buf_destroy_impl(buf);
+ (void) remove_reference(hdr, tag);
mutex_exit(hash_lock);
}
@@ -3870,6 +3713,7 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
* - arc_mru_ghost -> deleted
* - arc_mfu_ghost -> arc_l2c_only
* - arc_mfu_ghost -> deleted
+ * - arc_uncached -> deleted
*
* Return total size of evicted data buffers for eviction progress tracking.
* When evicting from ghost states return logical buffer size to make eviction
@@ -3881,21 +3725,22 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
* only the evicted headers size.
*/
static int64_t
-arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
+arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
- int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
- ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
*real_evicted = 0;
state = hdr->b_l1hdr.b_state;
if (GHOST_STATE(state)) {
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
/*
* l2arc_write_buffers() relies on a header's L1 portion
@@ -3921,49 +3766,34 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
* This buffer is cached on the 2nd Level ARC;
* don't destroy the header.
*/
- arc_change_state(arc_l2c_only, hdr, hash_lock);
+ arc_change_state(arc_l2c_only, hdr);
/*
* dropping from L1+L2 cached to L2-only,
* realloc to remove the L1 header.
*/
- hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ (void) arc_hdr_realloc(hdr, hdr_full_cache,
hdr_l2only_cache);
*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
} else {
- arc_change_state(arc_anon, hdr, hash_lock);
+ arc_change_state(arc_anon, hdr);
arc_hdr_destroy(hdr);
*real_evicted += HDR_FULL_SIZE;
}
return (bytes_evicted);
}
- ASSERT(state == arc_mru || state == arc_mfu);
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+ ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached);
+ evicted_state = (state == arc_uncached) ? arc_anon :
+ ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);
/* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(hdr) ||
- ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
- MSEC_TO_TICK(min_lifetime))) {
+ MSEC_TO_TICK(min_lifetime)) {
ARCSTAT_BUMP(arcstat_evict_skip);
return (bytes_evicted);
}
- ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
- while (hdr->b_l1hdr.b_buf) {
- arc_buf_t *buf = hdr->b_l1hdr.b_buf;
- if (!mutex_tryenter(&buf->b_evict_lock)) {
- ARCSTAT_BUMP(arcstat_mutex_miss);
- break;
- }
- if (buf->b_data != NULL) {
- bytes_evicted += HDR_GET_LSIZE(hdr);
- *real_evicted += HDR_GET_LSIZE(hdr);
- }
- mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy_impl(buf);
- }
-
if (HDR_HAS_L2HDR(hdr)) {
ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
} else {
@@ -3991,28 +3821,27 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
}
}
- if (hdr->b_l1hdr.b_bufcnt == 0) {
- arc_cksum_free(hdr);
-
- bytes_evicted += arc_hdr_size(hdr);
- *real_evicted += arc_hdr_size(hdr);
+ bytes_evicted += arc_hdr_size(hdr);
+ *real_evicted += arc_hdr_size(hdr);
- /*
- * If this hdr is being evicted and has a compressed
- * buffer then we discard it here before we change states.
- * This ensures that the accounting is updated correctly
- * in arc_free_data_impl().
- */
- if (hdr->b_l1hdr.b_pabd != NULL)
- arc_hdr_free_abd(hdr, B_FALSE);
+ /*
+ * If this hdr is being evicted and has a compressed buffer then we
+ * discard it here before we change states. This ensures that the
+ * accounting is updated correctly in arc_free_data_impl().
+ */
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_FALSE);
- if (HDR_HAS_RABD(hdr))
- arc_hdr_free_abd(hdr, B_TRUE);
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
- arc_change_state(evicted_state, hdr, hash_lock);
+ arc_change_state(evicted_state, hdr);
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+ if (evicted_state == arc_anon) {
+ arc_hdr_destroy(hdr);
+ *real_evicted += HDR_FULL_SIZE;
+ } else {
ASSERT(HDR_IN_HASH_TABLE(hdr));
- arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
}
return (bytes_evicted);
@@ -4040,15 +3869,15 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t bytes_evicted = 0, real_evicted = 0;
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
- int evict_count = zfs_arc_evict_batch_limit;
+ uint_t evict_count = zfs_arc_evict_batch_limit;
ASSERT3P(marker, !=, NULL);
- mls = multilist_sublist_lock(ml, idx);
+ mls = multilist_sublist_lock_idx(ml, idx);
for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
hdr = multilist_sublist_prev(mls, marker)) {
- if ((evict_count <= 0) || (bytes_evicted >= bytes))
+ if ((evict_count == 0) || (bytes_evicted >= bytes))
break;
/*
@@ -4097,8 +3926,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
if (mutex_tryenter(hash_lock)) {
uint64_t revicted;
- uint64_t evicted = arc_evict_hdr(hdr, hash_lock,
- &revicted);
+ uint64_t evicted = arc_evict_hdr(hdr, &revicted);
mutex_exit(hash_lock);
bytes_evicted += evicted;
@@ -4152,11 +3980,54 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* this CPU are able to make progress, make a voluntary preemption
* call here.
*/
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
return (bytes_evicted);
}
+static arc_buf_hdr_t *
+arc_state_alloc_marker(void)
+{
+ arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_evict_state_impl().
+ */
+ marker->b_spa = 0;
+
+ return (marker);
+}
+
+static void
+arc_state_free_marker(arc_buf_hdr_t *marker)
+{
+ kmem_cache_free(hdr_full_cache, marker);
+}
+
+/*
+ * Allocate an array of buffer headers used as placeholders during arc state
+ * eviction.
+ */
+static arc_buf_hdr_t **
+arc_state_alloc_markers(int count)
+{
+ arc_buf_hdr_t **markers;
+
+ markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
+ for (int i = 0; i < count; i++)
+ markers[i] = arc_state_alloc_marker();
+ return (markers);
+}
+
+static void
+arc_state_free_markers(arc_buf_hdr_t **markers, int count)
+{
+ for (int i = 0; i < count; i++)
+ arc_state_free_marker(markers[i]);
+ kmem_free(markers, sizeof (*markers) * count);
+}
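
For illustration only, here is a minimal user-space sketch of the marker pattern these helpers support: a placeholder node is parked in the list so that a bounded eviction pass can skip busy entries and later resume where it left off. The struct, the spa == 0 convention and the odd/even "busy" test are stand-ins, not part of the patch; only the sys/queue.h TAILQ macros are assumed.

#include <sys/queue.h>
#include <stdio.h>

struct node {
	TAILQ_ENTRY(node) link;
	int spa;	/* 0 marks a placeholder, like b_spa == 0 above */
	int id;
};
TAILQ_HEAD(nodelist, node);

int
main(void)
{
	struct nodelist list = TAILQ_HEAD_INITIALIZER(list);
	struct node nodes[6], marker = { .spa = 0, .id = -1 };

	/* Build an MRU-like list; lower ids are older and sit at the tail. */
	for (int i = 0; i < 6; i++) {
		nodes[i].spa = 1;
		nodes[i].id = i;
		TAILQ_INSERT_HEAD(&list, &nodes[i], link);
	}

	/* Park the marker at the tail, the end eviction works from. */
	TAILQ_INSERT_TAIL(&list, &marker, link);

	/* Always look at the entry just before the marker. */
	struct node *n;
	while ((n = TAILQ_PREV(&marker, nodelist, link)) != NULL) {
		if (n->id % 2 != 0) {
			/*
			 * Pretend this entry is busy (e.g. its hash lock is
			 * held): hop the marker over it so neither this pass
			 * nor a later one re-examines it.
			 */
			TAILQ_REMOVE(&list, &marker, link);
			TAILQ_INSERT_BEFORE(n, &marker, link);
			continue;
		}
		printf("evicting node %d\n", n->id);
		TAILQ_REMOVE(&list, n, link);
	}

	TAILQ_REMOVE(&list, &marker, link);
	return (0);
}

The skipped entries stay in the list with the marker parked just in front of them, so a later pass can resume from that point instead of rescanning from the tail.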
+
/*
* Evict buffers from the given arc state, until we've removed the
* specified number of bytes. Move the removed buffers to the
@@ -4171,8 +4042,8 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* the given arc state; which is used by arc_flush().
*/
static uint64_t
-arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
- arc_buf_contents_t type)
+arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
+ uint64_t bytes)
{
uint64_t total_evicted = 0;
multilist_t *ml = &state->arcs_list[type];
@@ -4188,20 +4059,16 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
* pick up where we left off for each individual sublist, rather
* than starting from the tail each time.
*/
- markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+ if (zthr_iscurthread(arc_evict_zthr)) {
+ markers = arc_state_evict_markers;
+ ASSERT3S(num_sublists, <=, arc_state_evict_marker_count);
+ } else {
+ markers = arc_state_alloc_markers(num_sublists);
+ }
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls;
- markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
- /*
- * A b_spa of 0 is used to indicate that this header is
- * a marker. This fact is used in arc_evict_type() and
- * arc_evict_state_impl().
- */
- markers[i]->b_spa = 0;
-
- mls = multilist_sublist_lock(ml, i);
+ mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_insert_tail(mls, markers[i]);
multilist_sublist_unlock(mls);
}
@@ -4215,19 +4082,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
uint64_t scan_evicted = 0;
/*
- * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
- * Request that 10% of the LRUs be scanned by the superblock
- * shrinker.
- */
- if (type == ARC_BUFC_DATA && aggsum_compare(
- &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) {
- arc_prune_async((aggsum_upper_bound(
- &arc_sums.arcstat_dnode_size) -
- arc_dnode_size_limit) / sizeof (dnode_t) /
- zfs_arc_dnode_reduce_percent);
- }
-
- /*
* Start eviction using a randomly selected sublist,
* this is to try and evenly balance eviction across all
* sublists. Always starting at the same sublist
@@ -4279,13 +4133,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes,
}
for (int i = 0; i < num_sublists; i++) {
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
-
- kmem_cache_free(hdr_full_cache, markers[i]);
}
- kmem_free(markers, sizeof (*markers) * num_sublists);
+ if (markers != arc_state_evict_markers)
+ arc_state_free_markers(markers, num_sublists);
return (total_evicted);
}
@@ -4312,7 +4165,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
uint64_t evicted = 0;
while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
- evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+ evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL);
if (!retry)
break;
@@ -4322,252 +4175,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
}
/*
- * Evict the specified number of bytes from the state specified,
- * restricting eviction to the spa and type given. This function
- * prevents us from trying to evict more from a state's list than
- * is "evictable", and to skip evicting altogether when passed a
+ * Evict the specified number of bytes from the state specified. This
+ * function prevents us from trying to evict more from a state's list
+ * than is "evictable", and skips evicting altogether when passed a
* negative value for "bytes". In contrast, arc_evict_state() will
* evict everything it can, when passed a negative value for "bytes".
*/
static uint64_t
-arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
- arc_buf_contents_t type)
+arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes)
{
uint64_t delta;
if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
bytes);
- return (arc_evict_state(state, spa, delta, type));
+ return (arc_evict_state(state, type, 0, delta));
}
return (0);
}
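
A tiny sketch, with invented numbers, of the contract described in the comment above: the wrapper clamps a positive request to what is evictable and does nothing for a non-positive request, whereas the lower layer it calls treats a negative request as "evict everything". Both helper names are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Lower layer: a negative request means "evict everything you can". */
static uint64_t
evict_state_sketch(uint64_t evictable, int64_t bytes)
{
	if (bytes < 0 || (uint64_t)bytes > evictable)
		return (evictable);
	return ((uint64_t)bytes);
}

/* Wrapper: clamp to the evictable amount, do nothing on <= 0 requests. */
static uint64_t
evict_impl_sketch(uint64_t evictable, int64_t bytes)
{
	if (bytes > 0 && evictable > 0) {
		int64_t delta = (uint64_t)bytes < evictable ?
		    bytes : (int64_t)evictable;
		return (evict_state_sketch(evictable, delta));
	}
	return (0);
}

int
main(void)
{
	uint64_t evictable = 4096;

	/* A large request is clamped to what is evictable: prints 4096. */
	printf("%llu\n", (unsigned long long)evict_impl_sketch(evictable, 10000));
	/* A negative request is skipped by the wrapper: prints 0... */
	printf("%llu\n", (unsigned long long)evict_impl_sketch(evictable, -1));
	/* ...while the lower layer would evict everything: prints 4096. */
	printf("%llu\n", (unsigned long long)evict_state_sketch(evictable, -1));
	return (0);
}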
/*
- * The goal of this function is to evict enough meta data buffers from the
- * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
- * more complicated than it appears because it is common for data buffers
- * to have holds on meta data buffers. In addition, dnode meta data buffers
- * will be held by the dnodes in the block preventing them from being freed.
- * This means we can't simply traverse the ARC and expect to always find
- * enough unheld meta data buffer to release.
- *
- * Therefore, this function has been updated to make alternating passes
- * over the ARC releasing data buffers and then newly unheld meta data
- * buffers. This ensures forward progress is maintained and meta_used
- * will decrease. Normally this is sufficient, but if required the ARC
- * will call the registered prune callbacks causing dentry and inodes to
- * be dropped from the VFS cache. This will make dnode meta data buffers
- * available for reclaim.
+ * Adjust the specified fraction, taking into account the initial ghost
+ * state(s) size, the ghost hit bytes pushing the fraction up, the ghost hit
+ * bytes pushing it down, and a balance factor that controls the decrease
+ * rate and is used to balance metadata vs. data.
*/
static uint64_t
-arc_evict_meta_balanced(uint64_t meta_used)
+arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down,
+ uint_t balance)
{
- int64_t delta, prune = 0, adjustmnt;
- uint64_t total_evicted = 0;
- arc_buf_contents_t type = ARC_BUFC_DATA;
- int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+ if (total < 8 || up + down == 0)
+ return (frac);
-restart:
/*
- * This slightly differs than the way we evict from the mru in
- * arc_evict because we don't have a "target" value (i.e. no
- * "meta" arc_p). As a result, I think we can completely
- * cannibalize the metadata in the MRU before we evict the
- * metadata from the MFU. I think we probably need to implement a
- * "metadata arc_p" value to do this properly.
+ * We should not have more ghost hits than ghost size, but they
+ * may get close. Restrict maximum adjustment in that case.
*/
- adjustmnt = meta_used - arc_meta_limit;
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
- adjustmnt);
- total_evicted += arc_evict_impl(arc_mru, 0, delta, type);
- adjustmnt -= delta;
+ if (up + down >= total / 4) {
+ uint64_t scale = (up + down) / (total / 8);
+ up /= scale;
+ down /= scale;
}
- /*
- * We can't afford to recalculate adjustmnt here. If we do,
- * new metadata buffers can sneak into the MRU or ANON lists,
- * thus penalize the MFU metadata. Although the fudge factor is
- * small, it has been empirically shown to be significant for
- * certain workloads (e.g. creating many empty directories). As
- * such, we use the original calculation for adjustmnt, and
- * simply decrement the amount of data evicted from the MRU.
- */
+ /* Get maximal dynamic range by choosing optimal shifts. */
+ int s = highbit64(total);
+ s = MIN(64 - s, 32);
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
- delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
- adjustmnt);
- total_evicted += arc_evict_impl(arc_mfu, 0, delta, type);
- }
+ uint64_t ofrac = (1ULL << 32) - frac;
- adjustmnt = meta_used - arc_meta_limit;
+ if (frac >= 4 * ofrac)
+ up /= frac / (2 * ofrac + 1);
+ up = (up << s) / (total >> (32 - s));
+ if (ofrac >= 4 * frac)
+ down /= ofrac / (2 * frac + 1);
+ down = (down << s) / (total >> (32 - s));
+ down = down * 100 / balance;
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
- delta = MIN(adjustmnt,
- zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
- total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type);
- adjustmnt -= delta;
- }
-
- if (adjustmnt > 0 &&
- zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
- delta = MIN(adjustmnt,
- zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
- total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type);
- }
-
- /*
- * If after attempting to make the requested adjustment to the ARC
- * the meta limit is still being exceeded then request that the
- * higher layers drop some cached objects which have holds on ARC
- * meta buffers. Requests to the upper layers will be made with
- * increasingly large scan sizes until the ARC is below the limit.
- */
- if (meta_used > arc_meta_limit) {
- if (type == ARC_BUFC_DATA) {
- type = ARC_BUFC_METADATA;
- } else {
- type = ARC_BUFC_DATA;
-
- if (zfs_arc_meta_prune) {
- prune += zfs_arc_meta_prune;
- arc_prune_async(prune);
- }
- }
-
- if (restarts > 0) {
- restarts--;
- goto restart;
- }
- }
- return (total_evicted);
-}
-
-/*
- * Evict metadata buffers from the cache, such that arcstat_meta_used is
- * capped by the arc_meta_limit tunable.
- */
-static uint64_t
-arc_evict_meta_only(uint64_t meta_used)
-{
- uint64_t total_evicted = 0;
- int64_t target;
-
- /*
- * If we're over the meta limit, we want to evict enough
- * metadata to get back under the meta limit. We don't want to
- * evict so much that we drop the MRU below arc_p, though. If
- * we're over the meta limit more than we're over arc_p, we
- * evict some from the MRU here, and some from the MFU below.
- */
- target = MIN((int64_t)(meta_used - arc_meta_limit),
- (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
-
- total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
-
- /*
- * Similar to the above, we want to evict enough bytes to get us
- * below the meta limit, but not so much as to drop us below the
- * space allotted to the MFU (which is defined as arc_c - arc_p).
- */
- target = MIN((int64_t)(meta_used - arc_meta_limit),
- (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
- (arc_c - arc_p)));
-
- total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
-
- return (total_evicted);
-}
-
-static uint64_t
-arc_evict_meta(uint64_t meta_used)
-{
- if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
- return (arc_evict_meta_only(meta_used));
- else
- return (arc_evict_meta_balanced(meta_used));
-}
-
-/*
- * Return the type of the oldest buffer in the given arc state
- *
- * This function will select a random sublist of type ARC_BUFC_DATA and
- * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
- * is compared, and the type which contains the "older" buffer will be
- * returned.
- */
-static arc_buf_contents_t
-arc_evict_type(arc_state_t *state)
-{
- multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
- multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
- int data_idx = multilist_get_random_index(data_ml);
- int meta_idx = multilist_get_random_index(meta_ml);
- multilist_sublist_t *data_mls;
- multilist_sublist_t *meta_mls;
- arc_buf_contents_t type;
- arc_buf_hdr_t *data_hdr;
- arc_buf_hdr_t *meta_hdr;
-
- /*
- * We keep the sublist lock until we're finished, to prevent
- * the headers from being destroyed via arc_evict_state().
- */
- data_mls = multilist_sublist_lock(data_ml, data_idx);
- meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
-
- /*
- * These two loops are to ensure we skip any markers that
- * might be at the tail of the lists due to arc_evict_state().
- */
-
- for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
- data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
- if (data_hdr->b_spa != 0)
- break;
- }
-
- for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
- meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
- if (meta_hdr->b_spa != 0)
- break;
- }
-
- if (data_hdr == NULL && meta_hdr == NULL) {
- type = ARC_BUFC_DATA;
- } else if (data_hdr == NULL) {
- ASSERT3P(meta_hdr, !=, NULL);
- type = ARC_BUFC_METADATA;
- } else if (meta_hdr == NULL) {
- ASSERT3P(data_hdr, !=, NULL);
- type = ARC_BUFC_DATA;
- } else {
- ASSERT3P(data_hdr, !=, NULL);
- ASSERT3P(meta_hdr, !=, NULL);
-
- /* The headers can't be on the sublist without an L1 header */
- ASSERT(HDR_HAS_L1HDR(data_hdr));
- ASSERT(HDR_HAS_L1HDR(meta_hdr));
-
- if (data_hdr->b_l1hdr.b_arc_access <
- meta_hdr->b_l1hdr.b_arc_access) {
- type = ARC_BUFC_DATA;
- } else {
- type = ARC_BUFC_METADATA;
- }
- }
-
- multilist_sublist_unlock(meta_mls);
- multilist_sublist_unlock(data_mls);
-
- return (type);
+ return (frac + up - down);
}
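
The fraction manipulated above is a 32-bit fixed-point value, where 1ULL << 32 represents 1.0. The following stand-alone sketch repeats the same arithmetic in user space so the scaling can be checked with throwaway numbers; the helper names, the sample ghost sizes and the balance value of 500 are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* User-space stand-in for the kernel's highbit64(). */
static int
highbit64_sketch(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}

/*
 * Same arithmetic as arc_evict_adj() above: "frac" is a 32-bit fixed-point
 * fraction, "total" the ghost size, "up"/"down" the ghost hit bytes, and
 * "balance" a percentage that slows the decrease.
 */
static uint64_t
evict_adj_sketch(uint64_t frac, uint64_t total, uint64_t up, uint64_t down,
    unsigned balance)
{
	if (total < 8 || up + down == 0)
		return (frac);

	if (up + down >= total / 4) {
		uint64_t scale = (up + down) / (total / 8);
		up /= scale;
		down /= scale;
	}

	int s = highbit64_sketch(total);
	s = (64 - s) < 32 ? (64 - s) : 32;

	uint64_t ofrac = (1ULL << 32) - frac;

	if (frac >= 4 * ofrac)
		up /= frac / (2 * ofrac + 1);
	up = (up << s) / (total >> (32 - s));
	if (ofrac >= 4 * frac)
		down /= ofrac / (2 * frac + 1);
	down = (down << s) / (total >> (32 - s));
	down = down * 100 / balance;

	return (frac + up - down);
}

int
main(void)
{
	/* Start at 1/4 metadata with 1 GiB of ghost state (illustrative). */
	uint64_t frac = 1ULL << 30;
	uint64_t total = 1ULL << 30;

	/* Metadata ghost hits dominate: the fraction should rise. */
	frac = evict_adj_sketch(frac, total, 64 << 20, 4 << 20, 500);
	printf("after meta-heavy hits: %.4f\n", (double)frac / (1ULL << 32));

	/* Data ghost hits dominate: the fraction should fall. */
	frac = evict_adj_sketch(frac, total, 4 << 20, 64 << 20, 500);
	printf("after data-heavy hits: %.4f\n", (double)frac / (1ULL << 32));
	return (0);
}

Running it shows the fraction rising after metadata-heavy ghost hits and falling after data-heavy ones, which is the behaviour arc_evict() relies on when it recomputes arc_meta, arc_pd and arc_pm.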
/*
@@ -4576,150 +4241,128 @@ arc_evict_type(arc_state_t *state)
static uint64_t
arc_evict(void)
{
- uint64_t total_evicted = 0;
- uint64_t bytes;
- int64_t target;
- uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
- uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used);
-
- /*
- * If we're over arc_meta_limit, we want to correct that before
- * potentially evicting data buffers below.
- */
- total_evicted += arc_evict_meta(ameta);
-
- /*
- * Adjust MRU size
- *
- * If we're over the target cache size, we want to evict enough
- * from the list to get back to our target size. We don't want
- * to evict too much from the MRU, such that it drops below
- * arc_p. So, if we're over our target cache size more than
- * the MRU is over arc_p, we'll evict enough to get back to
- * arc_p here, and then evict more from the MFU below.
- */
- target = MIN((int64_t)(asize - arc_c),
- (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
-
- /*
- * If we're below arc_meta_min, always prefer to evict data.
- * Otherwise, try to satisfy the requested number of bytes to
- * evict from the type which contains older buffers; in an
- * effort to keep newer buffers in the cache regardless of their
- * type. If we cannot satisfy the number of bytes from this
- * type, spill over into the next type.
- */
- if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA &&
- ameta > arc_meta_min) {
- bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * metadata, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
- } else {
- bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * data, we try to get the rest from metadata.
- */
- target -= bytes;
-
- total_evicted +=
- arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
- }
+ uint64_t asize, bytes, total_evicted = 0;
+ int64_t e, mrud, mrum, mfud, mfum, w;
+ static uint64_t ogrd, ogrm, ogfd, ogfm;
+ static uint64_t gsrd, gsrm, gsfd, gsfm;
+ uint64_t ngrd, ngrm, ngfd, ngfm;
+
+ /* Get current size of ARC states we can evict from. */
+ mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) +
+ zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]);
+ mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
+ zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
+ mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
+ mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
+ uint64_t d = mrud + mfud;
+ uint64_t m = mrum + mfum;
+ uint64_t t = d + m;
+
+ /* Get ARC ghost hits since last eviction. */
+ ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
+ uint64_t grd = ngrd - ogrd;
+ ogrd = ngrd;
+ ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
+ uint64_t grm = ngrm - ogrm;
+ ogrm = ngrm;
+ ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
+ uint64_t gfd = ngfd - ogfd;
+ ogfd = ngfd;
+ ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
+ uint64_t gfm = ngfm - ogfm;
+ ogfm = ngfm;
+
+ /* Adjust ARC states balance based on ghost hits. */
+ arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm,
+ grm + gfm, grd + gfd, zfs_arc_meta_balance);
+ arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100);
+ arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100);
- /*
- * Re-sum ARC stats after the first round of evictions.
- */
asize = aggsum_value(&arc_sums.arcstat_size);
- ameta = aggsum_value(&arc_sums.arcstat_meta_used);
-
-
- /*
- * Adjust MFU size
- *
- * Now that we've tried to evict enough from the MRU to get its
- * size back to arc_p, if we're still above the target cache
- * size, we evict the rest from the MFU.
- */
- target = asize - arc_c;
-
- if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA &&
- ameta > arc_meta_min) {
- bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * metadata, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
- } else {
- bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- /*
- * If we couldn't evict our target number of bytes from
- * data, we try to get the rest from data.
- */
- target -= bytes;
-
- total_evicted +=
- arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
- }
-
- /*
- * Adjust ghost lists
- *
- * In addition to the above, the ARC also defines target values
- * for the ghost lists. The sum of the mru list and mru ghost
- * list should never exceed the target size of the cache, and
- * the sum of the mru list, mfu list, mru ghost list, and mfu
- * ghost list should never exceed twice the target size of the
- * cache. The following logic enforces these limits on the ghost
- * caches, and evicts from them as needed.
- */
- target = zfs_refcount_count(&arc_mru->arcs_size) +
- zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
-
- bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ int64_t wt = t - (asize - arc_c);
+
+ /*
+ * Try to reduce pinned dnodes if more than 3/4 of the wanted metadata
+ * target is not evictable, or if the dnode size goes over arc_dnode_limit.
+ */
+ int64_t prune = 0;
+ int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
+ w = wt * (int64_t)(arc_meta >> 16) >> 16;
+ if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
+ zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) -
+ zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) -
+ zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) >
+ w * 3 / 4) {
+ prune = dn / sizeof (dnode_t) *
+ zfs_arc_dnode_reduce_percent / 100;
+ } else if (dn > arc_dnode_limit) {
+ prune = (dn - arc_dnode_limit) / sizeof (dnode_t) *
+ zfs_arc_dnode_reduce_percent / 100;
+ }
+ if (prune > 0)
+ arc_prune_async(prune);
+
+ /* Evict MRU metadata. */
+ w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16;
+ e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w));
+ bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e);
total_evicted += bytes;
+ mrum -= bytes;
+ asize -= bytes;
- target -= bytes;
+ /* Evict MFU metadata. */
+ w = wt * (int64_t)(arc_meta >> 16) >> 16;
+ e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w));
+ bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e);
+ total_evicted += bytes;
+ mfum -= bytes;
+ asize -= bytes;
+
+ /* Evict MRU data. */
+ wt -= m - total_evicted;
+ w = wt * (int64_t)(arc_pd >> 16) >> 16;
+ e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w));
+ bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e);
+ total_evicted += bytes;
+ mrud -= bytes;
+ asize -= bytes;
- total_evicted +=
- arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
+ /* Evict MFU data. */
+ e = asize - arc_c;
+ bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e);
+ mfud -= bytes;
+ total_evicted += bytes;
/*
- * We assume the sum of the mru list and mfu list is less than
- * or equal to arc_c (we enforced this above), which means we
- * can use the simpler of the two equations below:
+ * Evict ghost lists
*
- * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
- * mru ghost + mfu ghost <= arc_c
- */
- target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
- zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
-
- bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
- total_evicted += bytes;
-
- target -= bytes;
-
- total_evicted +=
- arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+ * The size of each state's ghost list represents how much that state
+ * may grow by shrinking the other states. Were it to shrink the other
+ * states to zero (which is unlikely), its ghost size would equal the
+ * sum of the other three state sizes. But an excessive ghost size may
+ * result in false ghost hits (too far back) that may never turn into
+ * real cache hits if several states are competing. So choose an
+ * arbitrary point of 1/2 of the other state sizes.
+ */
+ gsrd = (mrum + mfud + mfum) / 2;
+ e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) -
+ gsrd;
+ (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e);
+
+ gsrm = (mrud + mfud + mfum) / 2;
+ e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) -
+ gsrm;
+ (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e);
+
+ gsfd = (mrud + mrum + mfum) / 2;
+ e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) -
+ gsfd;
+ (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e);
+
+ gsfm = (mrud + mrum + mfud) / 2;
+ e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) -
+ gsfm;
+ (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e);
return (total_evicted);
}
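
The eviction targets above multiply a byte count by one or two of these 32-bit fixed-point fractions while staying inside 64-bit arithmetic, by dropping the low 16 bits of each fraction before the multiply. A small self-check of that identity, with made-up values; the variable names mirror the ones above but everything here is illustrative.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Illustrative values: 100 GiB wanted size, meta = 0.25, pm = 0.5. */
	int64_t wt = 100LL << 30;
	uint64_t meta = 1ULL << 30;	/* 0.25 in 32-bit fixed point */
	uint64_t pm = 1ULL << 31;	/* 0.50 in 32-bit fixed point */

	/* wt * meta / 2^32, computed as in arc_evict() to avoid overflow. */
	int64_t w_meta = wt * (int64_t)(meta >> 16) >> 16;

	/* wt * meta * pm / 2^64: the product of two fractions. */
	int64_t w_mrum = wt * (int64_t)(meta * pm >> 48) >> 16;

	printf("metadata target: %lld (expect %lld)\n",
	    (long long)w_meta, (long long)(wt / 4));
	printf("MRU metadata target: %lld (expect %lld)\n",
	    (long long)w_mrum, (long long)(wt / 8));

	assert(w_meta == wt / 4);
	assert(w_mrum == wt / 8);
	return (0);
}

Dropping the low 16 bits of the fraction before the multiply keeps the intermediate product within 64 bits for target sizes up to roughly 2^47 bytes, at the cost of a little precision.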
@@ -4734,7 +4377,7 @@ arc_flush(spa_t *spa, boolean_t retry)
* no good way to determine if all of a spa's buffers have been
* evicted from an arc state.
*/
- ASSERT(!retry || spa == 0);
+ ASSERT(!retry || spa == NULL);
if (spa != NULL)
guid = spa_load_guid(spa);
@@ -4750,12 +4393,18 @@ arc_flush(spa_t *spa, boolean_t retry)
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry);
}
void
arc_reduce_target_size(int64_t to_free)
{
- uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
+ uint64_t c = arc_c;
+
+ if (c <= arc_c_min)
+ return;
/*
* All callers want the ARC to actually evict (at least) this much
@@ -4765,26 +4414,16 @@ arc_reduce_target_size(int64_t to_free)
* immediately have arc_c < arc_size and therefore the arc_evict_zthr
* will evict.
*/
- uint64_t c = MIN(arc_c, asize);
-
- if (c > to_free && c - to_free > arc_c_min) {
- arc_c = c - to_free;
- atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
- if (arc_p > arc_c)
- arc_p = (arc_c >> 1);
- ASSERT(arc_c >= arc_c_min);
- ASSERT((int64_t)arc_p >= 0);
- } else {
- arc_c = arc_c_min;
- }
+ uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
+ if (asize < c)
+ to_free += c - asize;
+ arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min);
- if (asize > arc_c) {
- /* See comment in arc_evict_cb_check() on why lock+flag */
- mutex_enter(&arc_evict_lock);
- arc_evict_needed = B_TRUE;
- mutex_exit(&arc_evict_lock);
- zthr_wakeup(arc_evict_zthr);
- }
+ /* See comment in arc_evict_cb_check() on why lock+flag */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_needed = B_TRUE;
+ mutex_exit(&arc_evict_lock);
+ zthr_wakeup(arc_evict_zthr);
}
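
A short arithmetic sketch of the adjusted target computation, using invented sizes; the helper and its arguments are stand-ins for arc_c, arc_c_min, the aggsum'd ARC size and the caller's to_free.

#include <stdint.h>
#include <stdio.h>

static uint64_t
reduce_target_sketch(uint64_t c, uint64_t c_min, uint64_t asize,
    int64_t to_free)
{
	if (c <= c_min)
		return (c);
	/*
	 * If the ARC is currently smaller than its target, grow to_free by
	 * the shortfall so the new target lands below the actual size and
	 * the eviction thread has real work to do.
	 */
	if (asize < c)
		to_free += c - asize;
	if ((int64_t)c - to_free > (int64_t)c_min)
		return (c - to_free);
	return (c_min);
}

int
main(void)
{
	const uint64_t gib = 1ULL << 30;

	/* Target 8 GiB, 6 GiB cached, asked to free 1 GiB: prints 5 GiB. */
	printf("%llu GiB\n", (unsigned long long)
	    (reduce_target_sketch(8 * gib, 1 * gib, 6 * gib, 1 * gib) / gib));

	/* Asked to free more than is available: clamps at the 1 GiB floor. */
	printf("%llu GiB\n", (unsigned long long)
	    (reduce_target_sketch(8 * gib, 1 * gib, 6 * gib, 10 * gib) / gib));
	return (0);
}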
/*
@@ -4804,18 +4443,8 @@ arc_kmem_reap_soon(void)
size_t i;
kmem_cache_t *prev_cache = NULL;
kmem_cache_t *prev_data_cache = NULL;
- extern kmem_cache_t *zio_buf_cache[];
- extern kmem_cache_t *zio_data_buf_cache[];
#ifdef _KERNEL
- if ((aggsum_compare(&arc_sums.arcstat_meta_used,
- arc_meta_limit) >= 0) && zfs_arc_meta_prune) {
- /*
- * We are exceeding our meta-data cache limit.
- * Prune some entries to release holds on meta-data.
- */
- arc_prune_async(zfs_arc_meta_prune);
- }
#if defined(_ILP32)
/*
* Reclaim unused memory from all kmem caches.
@@ -4846,10 +4475,11 @@ arc_kmem_reap_soon(void)
abd_cache_reap_now();
}
-/* ARGSUSED */
static boolean_t
arc_evict_cb_check(void *arg, zthr_t *zthr)
{
+ (void) arg, (void) zthr;
+
#ifdef ZFS_DEBUG
/*
* This is necessary in order to keep the kstat information
@@ -4882,22 +4512,38 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
* which is held before this function is called, and is held by
* arc_wait_for_eviction() when it calls zthr_wakeup().
*/
- return (arc_evict_needed);
+ if (arc_evict_needed)
+ return (B_TRUE);
+
+ /*
+ * If we have buffers in the uncached state, evict them periodically.
+ */
+ return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
+ zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) &&
+ ddi_get_lbolt() - arc_last_uncached_flush >
+ MSEC_TO_TICK(arc_min_prefetch_ms / 2)));
}
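The check above wakes the eviction thread only when the uncached state holds evictable bytes and at least half of arc_min_prefetch_ms has elapsed since the last flush. A small userspace sketch of the same test, not part of the patch; the tick rate and all values are invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define	HZ			1000	/* hypothetical tick rate */
#define	MSEC_TO_TICK(ms)	((ms) * HZ / 1000)

int
main(void)
{
	uint64_t uncached_bytes = 4ULL << 20;	/* evictable uncached data */
	uint64_t arc_min_prefetch_ms = 1000;
	uint64_t lbolt = 5000;			/* "now", in ticks */
	uint64_t last_flush = 4300;		/* last uncached flush, ticks */

	/* Wake only if there is something to flush and enough time passed. */
	int wake = uncached_bytes != 0 &&
	    lbolt - last_flush > MSEC_TO_TICK(arc_min_prefetch_ms / 2);

	printf("flush uncached buffers: %s\n", wake ? "yes" : "no");
	return (0);
}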
/*
* Keep arc_size under arc_c by running arc_evict which evicts data
* from the ARC.
*/
-/* ARGSUSED */
static void
arc_evict_cb(void *arg, zthr_t *zthr)
{
+ (void) arg;
+
uint64_t evicted = 0;
fstrans_cookie_t cookie = spl_fstrans_mark();
- /* Evict from cache */
- evicted = arc_evict();
+ /* Always try to evict from the uncached state. */
+ arc_last_uncached_flush = ddi_get_lbolt();
+ evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE);
+ evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE);
+
+ /* Evict from other states only if told to. */
+ if (arc_evict_needed)
+ evicted += arc_evict();
/*
* If evicted is zero, we couldn't evict anything
@@ -4909,9 +4555,13 @@ arc_evict_cb(void *arg, zthr_t *zthr)
* infinite loop. Additionally, zthr_iscancelled() is
* checked here so that if the arc is shutting down, the
* broadcast will wake any remaining arc evict waiters.
+ *
+ * Note we check for cancellation using the zthr argument
+ * instead of arc_evict_zthr because the latter may not yet
+ * be initialized when the callback is first invoked.
*/
mutex_enter(&arc_evict_lock);
- arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
+ arc_evict_needed = !zthr_iscancelled(zthr) &&
evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
if (!arc_evict_needed) {
/*
@@ -4929,10 +4579,11 @@ arc_evict_cb(void *arg, zthr_t *zthr)
spl_fstrans_unmark(cookie);
}
-/* ARGSUSED */
static boolean_t
arc_reap_cb_check(void *arg, zthr_t *zthr)
{
+ (void) arg, (void) zthr;
+
int64_t free_memory = arc_available_memory();
static int reap_cb_check_counter = 0;
@@ -4976,10 +4627,11 @@ arc_reap_cb_check(void *arg, zthr_t *zthr)
* target size of the cache (arc_c), causing the arc_evict_cb()
* to free more buffers.
*/
-/* ARGSUSED */
static void
arc_reap_cb(void *arg, zthr_t *zthr)
{
+ (void) arg, (void) zthr;
+
int64_t free_memory;
fstrans_cookie_t cookie = spl_fstrans_mark();
@@ -5009,10 +4661,11 @@ arc_reap_cb(void *arg, zthr_t *zthr)
*/
free_memory = arc_available_memory();
- int64_t to_free =
- (arc_c >> arc_shrink_shift) - free_memory;
- if (to_free > 0) {
- arc_reduce_target_size(to_free);
+ int64_t can_free = arc_c - arc_c_min;
+ if (can_free > 0) {
+ int64_t to_free = (can_free >> arc_shrink_shift) - free_memory;
+ if (to_free > 0)
+ arc_reduce_target_size(to_free);
}
spl_fstrans_unmark(cookie);
}
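The reap path above now derives its shrink request from the reducible portion of the target (arc_c - arc_c_min) rather than from arc_c itself. A minimal userspace sketch of that sizing, not part of the patch, with the shift and sizes invented for illustration:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t arc_c = 6ULL << 30;		/* current target */
	uint64_t arc_c_min = 1ULL << 30;	/* floor */
	int arc_shrink_shift = 7;		/* shrink in 1/128 steps */
	int64_t free_memory = -(64LL << 20);	/* 64 MiB below the mark */

	int64_t can_free = arc_c - arc_c_min;	/* only this part may shrink */
	if (can_free > 0) {
		int64_t to_free = (can_free >> arc_shrink_shift) - free_memory;
		if (to_free > 0)
			printf("reduce target by %lld bytes\n",
			    (long long)to_free);
	}
	return (0);
}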
@@ -5072,40 +4725,8 @@ arc_reap_cb(void *arg, zthr_t *zthr)
* when we are adding new content to the cache.
*/
static void
-arc_adapt(int bytes, arc_state_t *state)
+arc_adapt(uint64_t bytes)
{
- int mult;
- uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
- int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
- int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
-
- ASSERT(bytes > 0);
- /*
- * Adapt the target size of the MRU list:
- * - if we just hit in the MRU ghost list, then increase
- * the target size of the MRU list.
- * - if we just hit in the MFU ghost list, then increase
- * the target size of the MFU list by decreasing the
- * target size of the MRU list.
- */
- if (state == arc_mru_ghost) {
- mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
- if (!zfs_arc_p_dampener_disable)
- mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
-
- arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
- } else if (state == arc_mfu_ghost) {
- uint64_t delta;
-
- mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
- if (!zfs_arc_p_dampener_disable)
- mult = MIN(mult, 10);
-
- delta = MIN(bytes * mult, arc_p);
- arc_p = MAX(arc_p_min, arc_p - delta);
- }
- ASSERT((int64_t)arc_p >= 0);
-
/*
* Wake reap thread if we do not have any available memory
*/
@@ -5124,18 +4745,12 @@ arc_adapt(int bytes, arc_state_t *state)
* If we're within (2 * maxblocksize) bytes of the target
* cache size, increment the target cache size
*/
- ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
- if (aggsum_upper_bound(&arc_sums.arcstat_size) >=
- arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
- atomic_add_64(&arc_c, (int64_t)bytes);
- if (arc_c > arc_c_max)
+ if (aggsum_upper_bound(&arc_sums.arcstat_size) +
+ 2 * SPA_MAXBLOCKSIZE >= arc_c) {
+ uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE);
+ if (atomic_add_64_nv(&arc_c, dc) > arc_c_max)
arc_c = arc_c_max;
- else if (state == arc_anon)
- atomic_add_64(&arc_p, (int64_t)bytes);
- if (arc_p > arc_c)
- arc_p = arc_c;
}
- ASSERT((int64_t)arc_p >= 0);
}
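The simplified growth rule above bumps the target by at least SPA_OLD_MAXBLOCKSIZE whenever the cache is within two maximum-sized blocks of it, clamped at arc_c_max. A minimal userspace sketch, not part of the patch; the sizes in main() are invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define	SPA_MAXBLOCKSIZE	(16ULL << 20)	/* 16 MiB */
#define	SPA_OLD_MAXBLOCKSIZE	(128ULL << 10)	/* 128 KiB */
#define	MAX(a, b)		((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t arc_c = 4ULL << 30;		/* current target */
	uint64_t arc_c_max = 8ULL << 30;	/* hard cap */
	uint64_t arc_size = arc_c - (20ULL << 20); /* nearly full */
	uint64_t bytes = 64ULL << 10;		/* incoming buffer size */

	/* Grow the target only when we are close to filling it. */
	if (arc_size + 2 * SPA_MAXBLOCKSIZE >= arc_c) {
		uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE);
		arc_c += dc;
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
	}
	printf("arc_c = %llu bytes\n", (unsigned long long)arc_c);
	return (0);
}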
/*
@@ -5167,26 +4782,24 @@ arc_is_overflowing(boolean_t use_reserve)
}
static abd_t *
-arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
int alloc_flags)
{
arc_buf_contents_t type = arc_buf_type(hdr);
arc_get_data_impl(hdr, size, tag, alloc_flags);
- if (type == ARC_BUFC_METADATA) {
- return (abd_alloc(size, B_TRUE));
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- return (abd_alloc(size, B_FALSE));
- }
+ if (alloc_flags & ARC_HDR_ALLOC_LINEAR)
+ return (abd_alloc_linear(size, type == ARC_BUFC_METADATA));
+ else
+ return (abd_alloc(size, type == ARC_BUFC_METADATA));
}
static void *
-arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT);
+ arc_get_data_impl(hdr, size, tag, 0);
if (type == ARC_BUFC_METADATA) {
return (zio_buf_alloc(size));
} else {
@@ -5281,14 +4894,10 @@ arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve)
* limit, we'll only signal the reclaim thread and continue on.
*/
static void
-arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
int alloc_flags)
{
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
-
- if (alloc_flags & ARC_HDR_DO_ADAPT)
- arc_adapt(size, state);
+ arc_adapt(size);
/*
* If arc_size is currently overflowing, we must be adding data
@@ -5306,7 +4915,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
alloc_flags & ARC_HDR_USE_RESERVE);
- VERIFY3U(hdr->b_type, ==, type);
+ arc_buf_contents_t type = arc_buf_type(hdr);
if (type == ARC_BUFC_METADATA) {
arc_space_consume(size, ARC_SPACE_META);
} else {
@@ -5317,9 +4926,11 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
* Update the state size. Note that ghost states have a
* "ghost size" and so don't need to be updated.
*/
+ arc_state_t *state = hdr->b_l1hdr.b_state;
if (!GHOST_STATE(state)) {
- (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
+ (void) zfs_refcount_add_many(&state->arcs_size[type], size,
+ tag);
/*
* If this is reached via arc_read, the link is
@@ -5335,28 +4946,19 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
(void) zfs_refcount_add_many(&state->arcs_esize[type],
size, tag);
}
-
- /*
- * If we are growing the cache, and we are adding anonymous
- * data, and we have outgrown arc_p, update arc_p
- */
- if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c &&
- hdr->b_l1hdr.b_state == arc_anon &&
- (zfs_refcount_count(&arc_anon->arcs_size) +
- zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
- arc_p = MIN(arc_c, arc_p + size);
}
}
static void
-arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size,
+ const void *tag)
{
arc_free_data_impl(hdr, size, tag);
abd_free(abd);
}
static void
-arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
@@ -5373,7 +4975,7 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
* Free the arc data buffer.
*/
static void
-arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
@@ -5386,7 +4988,7 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
(void) zfs_refcount_remove_many(&state->arcs_esize[type],
size, tag);
}
- (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
+ (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag);
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
@@ -5399,150 +5001,155 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
/*
* This routine is called whenever a buffer is accessed.
- * NOTE: the hash lock is dropped in this function.
*/
static void
-arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
{
- clock_t now;
-
- ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
ASSERT(HDR_HAS_L1HDR(hdr));
+ /*
+ * Update buffer prefetch status.
+ */
+ boolean_t was_prefetch = HDR_PREFETCH(hdr);
+ boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH;
+ if (was_prefetch != now_prefetch) {
+ if (was_prefetch) {
+ ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit,
+ HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive,
+ prefetch);
+ }
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ if (was_prefetch) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ }
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ if (now_prefetch) {
+ if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ ARCSTAT_BUMP(arcstat_prescient_prefetch);
+ } else {
+ ARCSTAT_BUMP(arcstat_predictive_prefetch);
+ }
+ }
+ if (arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+
+ clock_t now = ddi_get_lbolt();
if (hdr->b_l1hdr.b_state == arc_anon) {
+ arc_state_t *new_state;
/*
- * This buffer is not in the cache, and does not
- * appear in our "ghost" list. Add the new buffer
- * to the MRU state.
+ * This buffer is not in the cache, and does not appear in
+ * our "ghost" lists. Add it to the MRU or uncached state.
*/
-
ASSERT0(hdr->b_l1hdr.b_arc_access);
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mru, hdr, hash_lock);
-
+ hdr->b_l1hdr.b_arc_access = now;
+ if (HDR_UNCACHED(hdr)) {
+ new_state = arc_uncached;
+ DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *,
+ hdr);
+ } else {
+ new_state = arc_mru;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ }
+ arc_change_state(new_state, hdr);
} else if (hdr->b_l1hdr.b_state == arc_mru) {
- now = ddi_get_lbolt();
+ /*
+ * This buffer has been accessed once recently and either
+ * its read is still in progress or it is in the cache.
+ */
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ hdr->b_l1hdr.b_arc_access = now;
+ return;
+ }
+ hdr->b_l1hdr.b_mru_hits++;
+ ARCSTAT_BUMP(arcstat_mru_hits);
/*
- * If this buffer is here because of a prefetch, then either:
- * - clear the flag if this is a "referencing" read
- * (any subsequent access will bump this into the MFU state).
- * or
- * - move the buffer to the head of the list if this is
- * another prefetch (to make it less likely to be evicted).
+ * If the previous access was a prefetch, then it already
+ * handled possible promotion, so nothing more to do for now.
*/
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
- /* link protected by hash lock */
- ASSERT(multilist_link_active(
- &hdr->b_l1hdr.b_arc_node));
- } else {
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_decrement_state(hdr);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREFETCH |
- ARC_FLAG_PRESCIENT_PREFETCH);
- hdr->b_l1hdr.b_mru_hits++;
- ARCSTAT_BUMP(arcstat_mru_hits);
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_increment_state(hdr);
- }
+ if (was_prefetch) {
hdr->b_l1hdr.b_arc_access = now;
return;
}
/*
- * This buffer has been "accessed" only once so far,
- * but it is still in the cache. Move it to the MFU
- * state.
+ * If more than ARC_MINTIME has passed since the previous
+ * hit, promote the buffer to the MFU state.
*/
if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
ARC_MINTIME)) {
- /*
- * More than 125ms have passed since we
- * instantiated this buffer. Move it to the
- * most frequently used state.
- */
hdr->b_l1hdr.b_arc_access = now;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mfu, hdr, hash_lock);
+ arc_change_state(arc_mfu, hdr);
}
- hdr->b_l1hdr.b_mru_hits++;
- ARCSTAT_BUMP(arcstat_mru_hits);
} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
arc_state_t *new_state;
/*
- * This buffer has been "accessed" recently, but
- * was evicted from the cache. Move it to the
- * MFU state.
+ * This buffer has been accessed once recently, but was
+ * evicted from the cache. Had the MRU been bigger, this
+ * would have been an MRU hit, so handle it the same way,
+ * except we don't need to check the previous access time.
*/
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ hdr->b_l1hdr.b_mru_ghost_hits++;
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ hdr->b_l1hdr.b_arc_access = now;
+ wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)],
+ arc_hdr_size(hdr));
+ if (was_prefetch) {
new_state = arc_mru;
- if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_decrement_state(hdr);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREFETCH |
- ARC_FLAG_PRESCIENT_PREFETCH);
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_increment_state(hdr);
- }
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
new_state = arc_mfu;
DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
}
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- arc_change_state(new_state, hdr, hash_lock);
-
- hdr->b_l1hdr.b_mru_ghost_hits++;
- ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ arc_change_state(new_state, hdr);
} else if (hdr->b_l1hdr.b_state == arc_mfu) {
/*
- * This buffer has been accessed more than once and is
- * still in the cache. Keep it in the MFU state.
- *
- * NOTE: an add_reference() that occurred when we did
- * the arc_read() will have kicked this off the list.
- * If it was a prefetch, we will explicitly move it to
- * the head of the list now.
+ * This buffer has been accessed more than once and is either
+ * still in the cache or being restored from one of the ghost lists.
*/
-
- hdr->b_l1hdr.b_mfu_hits++;
- ARCSTAT_BUMP(arcstat_mfu_hits);
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ if (!HDR_IO_IN_PROGRESS(hdr)) {
+ hdr->b_l1hdr.b_mfu_hits++;
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ }
+ hdr->b_l1hdr.b_arc_access = now;
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
- arc_state_t *new_state = arc_mfu;
/*
- * This buffer has been accessed more than once but has
- * been evicted from the cache. Move it back to the
- * MFU state.
+ * This buffer has been accessed more than once recently, but
+ * has been evicted from the cache. Had the MFU been bigger,
+ * it would have stayed in cache, so move it back to the MFU state.
*/
-
- if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
- /*
- * This is a prefetch access...
- * move this block back to the MRU state.
- */
- new_state = arc_mru;
- }
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(new_state, hdr, hash_lock);
-
hdr->b_l1hdr.b_mfu_ghost_hits++;
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ hdr->b_l1hdr.b_arc_access = now;
+ wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)],
+ arc_hdr_size(hdr));
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr);
+ } else if (hdr->b_l1hdr.b_state == arc_uncached) {
+ /*
+ * This buffer is uncacheable, but we got a hit. Probably
+ * a demand read after prefetch. Nothing more to do here.
+ */
+ if (!HDR_IO_IN_PROGRESS(hdr))
+ ARCSTAT_BUMP(arcstat_uncached_hits);
+ hdr->b_l1hdr.b_arc_access = now;
} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
/*
- * This buffer is on the 2nd Level ARC.
+ * This buffer is on the 2nd Level ARC and was not accessed
+ * for a long time, so treat it as new and put it into the MRU.
*/
-
- hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
- arc_change_state(arc_mfu, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = now;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mru, hdr);
} else {
cmn_err(CE_PANIC, "invalid arc state 0x%p",
hdr->b_l1hdr.b_state);
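The rewritten arc_access() above is essentially a state machine over the header's current list. A minimal userspace sketch of those transitions as a pure function, not part of the patch; the enum, flag parameters, and main() are illustrative assumptions, and stats, locking, and L2ARC bookkeeping are omitted:

#include <stdio.h>

typedef enum {
	ST_ANON, ST_MRU, ST_MRU_GHOST, ST_MFU, ST_MFU_GHOST,
	ST_UNCACHED, ST_L2C_ONLY
} st_t;

static st_t
next_state(st_t cur, int uncached, int was_prefetch, int past_mintime)
{
	switch (cur) {
	case ST_ANON:		/* first insertion */
		return (uncached ? ST_UNCACHED : ST_MRU);
	case ST_MRU:		/* promote only on a later, non-prefetch hit */
		return (!was_prefetch && past_mintime ? ST_MFU : ST_MRU);
	case ST_MRU_GHOST:	/* ghost hit: treat like an MRU hit */
		return (was_prefetch ? ST_MRU : ST_MFU);
	case ST_MFU_GHOST:	/* restore straight to MFU */
		return (ST_MFU);
	case ST_L2C_ONLY:	/* long unused: treat as new */
		return (ST_MRU);
	default:		/* MFU and uncached stay where they are */
		return (cur);
	}
}

int
main(void)
{
	printf("mru + demand hit after ARC_MINTIME -> %d (ST_MFU=%d)\n",
	    next_state(ST_MRU, 0, 0, 1), ST_MFU);
	return (0);
}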
@@ -5556,7 +5163,6 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
void
arc_buf_access(arc_buf_t *buf)
{
- mutex_enter(&buf->b_evict_lock);
arc_buf_hdr_t *hdr = buf->b_hdr;
/*
@@ -5564,54 +5170,51 @@ arc_buf_access(arc_buf_t *buf)
* The header must be checked again under the hash_lock in order
* to handle the case where it is concurrently being released.
*/
- if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
- mutex_exit(&buf->b_evict_lock);
+ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr))
return;
- }
kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
mutex_exit(hash_lock);
- mutex_exit(&buf->b_evict_lock);
ARCSTAT_BUMP(arcstat_access_skip);
return;
}
- mutex_exit(&buf->b_evict_lock);
-
ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
- hdr->b_l1hdr.b_state == arc_mfu);
+ hdr->b_l1hdr.b_state == arc_mfu ||
+ hdr->b_l1hdr.b_state == arc_uncached);
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
+ arc_access(hdr, 0, B_TRUE);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
+ ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch,
+ !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
}
/* a generic arc_read_done_func_t which you can use */
-/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *arg)
{
+ (void) zio, (void) zb, (void) bp;
+
if (buf == NULL)
return;
- bcopy(buf->b_data, arg, arc_buf_size(buf));
+ memcpy(arg, buf->b_data, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
/* a generic arc_read_done_func_t */
-/* ARGSUSED */
void
arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *arg)
{
+ (void) zb, (void) bp;
arc_buf_t **bufp = arg;
if (buf == NULL) {
@@ -5649,7 +5252,6 @@ arc_read_done(zio_t *zio)
kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list;
arc_callback_t *acb;
- boolean_t freeable = B_FALSE;
/*
* The hdr was inserted into hash-table and removed from lists
@@ -5662,7 +5264,7 @@ arc_read_done(zio_t *zio)
if (HDR_IN_HASH_TABLE(hdr)) {
arc_buf_hdr_t *found;
- ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
ASSERT3U(hdr->b_dva.dva_word[0], ==,
BP_IDENTITY(zio->io_bp)->dva_word[0]);
ASSERT3U(hdr->b_dva.dva_word[1], ==,
@@ -5682,17 +5284,20 @@ arc_read_done(zio_t *zio)
zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
hdr->b_crypt_hdr.b_iv);
- if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
- void *tmpbuf;
-
- tmpbuf = abd_borrow_buf_copy(zio->io_abd,
- sizeof (zil_chain_t));
- zio_crypt_decode_mac_zil(tmpbuf,
- hdr->b_crypt_hdr.b_mac);
- abd_return_buf(zio->io_abd, tmpbuf,
- sizeof (zil_chain_t));
- } else {
- zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ if (zio->io_error == 0) {
+ if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
+ void *tmpbuf;
+
+ tmpbuf = abd_borrow_buf_copy(zio->io_abd,
+ sizeof (zil_chain_t));
+ zio_crypt_decode_mac_zil(tmpbuf,
+ hdr->b_crypt_hdr.b_mac);
+ abd_return_buf(zio->io_abd, tmpbuf,
+ sizeof (zil_chain_t));
+ } else {
+ zio_crypt_decode_mac_bp(bp,
+ hdr->b_crypt_hdr.b_mac);
+ }
}
}
@@ -5719,17 +5324,7 @@ arc_read_done(zio_t *zio)
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
-
- if (hash_lock && zio->io_error == 0 &&
- hdr->b_l1hdr.b_state == arc_anon) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- arc_access(hdr, hash_lock);
- }
+ hdr->b_l1hdr.b_acb = NULL;
/*
* If a read request has a callback (i.e. acb_done is not NULL), then we
@@ -5739,6 +5334,10 @@ arc_read_done(zio_t *zio)
*/
int callback_cnt = 0;
for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
+
+ /* We need the last one to call below in original order. */
+ callback_list = acb;
+
if (!acb->acb_done || acb->acb_nobuf)
continue;
@@ -5767,7 +5366,8 @@ arc_read_done(zio_t *zio)
ASSERT(BP_IS_PROTECTED(bp));
error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
- spa_log_error(zio->io_spa, &acb->acb_zb);
+ spa_log_error(zio->io_spa, &acb->acb_zb,
+ BP_GET_LOGICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
zio->io_spa, NULL, &acb->acb_zb, zio, 0);
@@ -5802,44 +5402,21 @@ arc_read_done(zio_t *zio)
*/
ASSERT(callback_cnt < 2 || hash_lock != NULL);
- hdr->b_l1hdr.b_acb = NULL;
- arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- if (callback_cnt == 0)
- ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
-
- ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
- callback_list != NULL);
-
if (zio->io_error == 0) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
if (hdr->b_l1hdr.b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
+ arc_change_state(arc_anon, hdr);
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
- freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
- /*
- * Broadcast before we drop the hash_lock to avoid the possibility
- * that the hdr (and hence the cv) might be freed before we get to
- * the cv_broadcast().
- */
- cv_broadcast(&hdr->b_l1hdr.b_cv);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ (void) remove_reference(hdr, hdr);
- if (hash_lock != NULL) {
+ if (hash_lock != NULL)
mutex_exit(hash_lock);
- } else {
- /*
- * This block was freed while we waited for the read to
- * complete. It has been removed from the hash table and
- * moved to the anonymous state (so that it won't show up
- * in the cache).
- */
- ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
- freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
- }
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
@@ -5863,12 +5440,18 @@ arc_read_done(zio_t *zio)
zio_nowait(acb->acb_zio_dummy);
}
- callback_list = acb->acb_next;
- kmem_free(acb, sizeof (arc_callback_t));
+ callback_list = acb->acb_prev;
+ if (acb->acb_wait) {
+ mutex_enter(&acb->acb_wait_lock);
+ acb->acb_wait_error = zio->io_error;
+ acb->acb_wait = B_FALSE;
+ cv_signal(&acb->acb_wait_cv);
+ mutex_exit(&acb->acb_wait_lock);
+ /* acb will be freed by the waiting thread. */
+ } else {
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
}
-
- if (freeable)
- arc_hdr_destroy(hdr);
}
/*
@@ -5905,6 +5488,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
(zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
+ arc_buf_t *buf = NULL;
int rc = 0;
ASSERT(!embedded_bp ||
@@ -5931,10 +5515,10 @@ top:
* and treat it as a checksum error. This allows an alternate blkptr
* to be tried when one is available (e.g. ditto blocks).
*/
- if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER,
- BLK_VERIFY_LOG)) {
+ if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
+ BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
rc = SET_ERROR(ECKSUM);
- goto out;
+ goto done;
}
if (!embedded_bp) {
@@ -5954,19 +5538,17 @@ top:
*/
if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
(hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
- arc_buf_t *buf = NULL;
- *arc_flags |= ARC_FLAG_CACHED;
+ boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
if (HDR_IO_IN_PROGRESS(hdr)) {
- zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
-
if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_cached_only_in_progress);
rc = SET_ERROR(ENOENT);
- goto out;
+ goto done;
}
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
ASSERT3P(head_zio, !=, NULL);
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
@@ -5980,21 +5562,28 @@ top:
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_async_upgrade_sync);
}
- if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREDICTIVE_PREFETCH);
- }
- if (*arc_flags & ARC_FLAG_WAIT) {
- cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
- mutex_exit(hash_lock);
- goto top;
- }
- ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
-
- if (done) {
- arc_callback_t *acb = NULL;
+ DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, *arc_flags, B_FALSE);
+ /*
+ * If there are multiple threads reading the same block
+ * and that block is not yet in the ARC, then only one
+ * thread will do the physical I/O and all other
+ * threads will wait until that I/O completes.
+ * Synchronous reads use the acb_wait_cv whereas nowait
+ * reads register a callback. Both are signalled/called
+ * in arc_read_done.
+ *
+ * Errors of the physical I/O may need to be propagated.
+ * Synchronous read errors are returned here from
+ * arc_read_done via acb_wait_error. Nowait reads
+ * attach the acb_zio_dummy zio to pio and
+ * arc_read_done propagates the physical I/O's io_error
+ * to acb_zio_dummy, and thereby to pio.
+ */
+ arc_callback_t *acb = NULL;
+ if (done || pio || *arc_flags & ARC_FLAG_WAIT) {
acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP);
acb->acb_done = done;
@@ -6003,46 +5592,52 @@ top:
acb->acb_encrypted = encrypted_read;
acb->acb_noauth = noauth_read;
acb->acb_nobuf = no_buf;
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ acb->acb_wait = B_TRUE;
+ mutex_init(&acb->acb_wait_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ cv_init(&acb->acb_wait_cv, NULL,
+ CV_DEFAULT, NULL);
+ }
acb->acb_zb = *zb;
- if (pio != NULL)
+ if (pio != NULL) {
acb->acb_zio_dummy = zio_null(pio,
spa, NULL, NULL, NULL, zio_flags);
-
- ASSERT3P(acb->acb_done, !=, NULL);
+ }
acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb->acb_prev = acb;
hdr->b_l1hdr.b_acb = acb;
}
mutex_exit(hash_lock);
+
+ ARCSTAT_BUMP(arcstat_iohits);
+ ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
+ demand, prefetch, is_data, data, metadata, iohits);
+
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ mutex_enter(&acb->acb_wait_lock);
+ while (acb->acb_wait) {
+ cv_wait(&acb->acb_wait_cv,
+ &acb->acb_wait_lock);
+ }
+ rc = acb->acb_wait_error;
+ mutex_exit(&acb->acb_wait_lock);
+ mutex_destroy(&acb->acb_wait_lock);
+ cv_destroy(&acb->acb_wait_cv);
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
goto out;
}
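The comment above describes how concurrent readers of one in-flight block either register a callback or sleep on acb_wait_cv until arc_read_done() signals them. A rough userspace analogy using POSIX threads, not the kernel primitives used in the patch; the structure fields and values here are invented, with one thread standing in for the I/O completion and the other for a synchronous reader:

/* build: cc -pthread wait_sketch.c */
#include <pthread.h>
#include <stdio.h>

typedef struct wait_cb {
	pthread_mutex_t lock;
	pthread_cond_t cv;
	int wait;		/* cleared by the completion path */
	int error;		/* propagated I/O error */
} wait_cb_t;

static void *
io_done(void *arg)
{
	wait_cb_t *acb = arg;

	pthread_mutex_lock(&acb->lock);
	acb->error = 0;		/* pretend the read succeeded */
	acb->wait = 0;
	pthread_cond_signal(&acb->cv);
	pthread_mutex_unlock(&acb->lock);
	return (NULL);
}

int
main(void)
{
	wait_cb_t acb = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 1, 0 };
	pthread_t t;

	pthread_create(&t, NULL, io_done, &acb);

	/* Synchronous reader: block until the in-flight read completes. */
	pthread_mutex_lock(&acb.lock);
	while (acb.wait)
		pthread_cond_wait(&acb.cv, &acb.lock);
	pthread_mutex_unlock(&acb.lock);

	pthread_join(t, NULL);
	printf("read completed, error = %d\n", acb.error);
	return (0);
}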
ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
- hdr->b_l1hdr.b_state == arc_mfu);
+ hdr->b_l1hdr.b_state == arc_mfu ||
+ hdr->b_l1hdr.b_state == arc_uncached);
- if (done && !no_buf) {
- if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
- /*
- * This is a demand read which does not have to
- * wait for i/o because we did a predictive
- * prefetch i/o for it, which has completed.
- */
- DTRACE_PROBE1(
- arc__demand__hit__predictive__prefetch,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(
- arcstat_demand_hit_predictive_prefetch);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PREDICTIVE_PREFETCH);
- }
-
- if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
- ARCSTAT_BUMP(
- arcstat_demand_hit_prescient_prefetch);
- arc_hdr_clear_flags(hdr,
- ARC_FLAG_PRESCIENT_PREFETCH);
- }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, *arc_flags, B_TRUE);
+ if (done && !no_buf) {
ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
/* Get a buf with the desired data in it. */
@@ -6057,44 +5652,28 @@ top:
*/
rc = SET_ERROR(EIO);
if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
- spa_log_error(spa, zb);
+ spa_log_error(spa, zb, hdr->b_birth);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0);
}
}
if (rc != 0) {
- (void) remove_reference(hdr, hash_lock,
- private);
arc_buf_destroy_impl(buf);
buf = NULL;
+ (void) remove_reference(hdr, private);
}
/* assert any errors weren't due to unloaded keys */
ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
rc != EACCES);
- } else if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_decrement_state(hdr);
- arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_increment_state(hdr);
}
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
- if (*arc_flags & ARC_FLAG_L2CACHE)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
- demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
- data, metadata, hits);
-
- if (done)
- done(NULL, zb, bp, buf, private);
+ ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
+ demand, prefetch, is_data, data, metadata, hits);
+ *arc_flags |= ARC_FLAG_CACHED;
+ goto done;
} else {
uint64_t lsize = BP_GET_LSIZE(bp);
uint64_t psize = BP_GET_PSIZE(bp);
@@ -6105,12 +5684,13 @@ top:
uint64_t size;
abd_t *hdr_abd;
int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
- rc = SET_ERROR(ENOENT);
if (hash_lock != NULL)
mutex_exit(hash_lock);
- goto out;
+ rc = SET_ERROR(ENOENT);
+ goto done;
}
if (hdr == NULL) {
@@ -6119,13 +5699,12 @@ top:
* embedded data.
*/
arc_buf_hdr_t *exists = NULL;
- arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
- hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ hdr = arc_hdr_alloc(guid, psize, lsize,
BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
if (!embedded_bp) {
hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ hdr->b_birth = BP_GET_BIRTH(bp);
exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
@@ -6135,7 +5714,6 @@ top:
arc_hdr_destroy(hdr);
goto top; /* restart the IO request */
}
- alloc_flags |= ARC_HDR_DO_ADAPT;
} else {
/*
* This block is in the ghost cache or encrypted data
@@ -6155,7 +5733,9 @@ top:
ASSERT0(zfs_refcount_count(
&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+#ifdef ZFS_DEBUG
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+#endif
} else if (HDR_IO_IN_PROGRESS(hdr)) {
/*
* If this header already had an IO in progress
@@ -6166,25 +5746,47 @@ top:
* and so the performance impact shouldn't
* matter.
*/
- cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
+ arc_callback_t *acb = kmem_zalloc(
+ sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_wait = B_TRUE;
+ mutex_init(&acb->acb_wait_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
+ NULL);
+ acb->acb_zio_head =
+ hdr->b_l1hdr.b_acb->acb_zio_head;
+ acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb->acb_prev = acb;
+ hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
+ mutex_enter(&acb->acb_wait_lock);
+ while (acb->acb_wait) {
+ cv_wait(&acb->acb_wait_cv,
+ &acb->acb_wait_lock);
+ }
+ mutex_exit(&acb->acb_wait_lock);
+ mutex_destroy(&acb->acb_wait_lock);
+ cv_destroy(&acb->acb_wait_cv);
+ kmem_free(acb, sizeof (arc_callback_t));
goto top;
}
-
- /*
- * This is a delicate dance that we play here.
- * This hdr might be in the ghost list so we access
- * it to move it out of the ghost list before we
- * initiate the read. If it's a prefetch then
- * it won't have a callback so we'll remove the
- * reference that arc_buf_alloc_impl() created. We
- * do this after we've called arc_access() to
- * avoid hitting an assert in remove_reference().
- */
- arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
- arc_access(hdr, hash_lock);
+ }
+ if (*arc_flags & ARC_FLAG_UNCACHED) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
+ if (!encrypted_read)
+ alloc_flags |= ARC_HDR_ALLOC_LINEAR;
}
+ /*
+ * Take an additional reference for IO_IN_PROGRESS. It keeps
+ * arc_access() from putting this header, which has no buffers
+ * or other references yet is obviously non-evictable, onto
+ * the evictable list of the MRU or MFU state.
+ */
+ add_reference(hdr, hdr);
+ if (!embedded_bp)
+ arc_access(hdr, *arc_flags, B_FALSE);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
arc_hdr_alloc_abd(hdr, alloc_flags);
if (encrypted_read) {
ASSERT(HDR_HAS_RABD(hdr));
@@ -6211,24 +5813,10 @@ top:
zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
}
- if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_decrement_state(hdr);
- arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
- if (HDR_HAS_L2HDR(hdr))
- l2arc_hdr_arcstats_increment_state(hdr);
- }
- if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
- if (*arc_flags & ARC_FLAG_L2CACHE)
- arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (BP_IS_AUTHENTICATED(bp))
arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
if (BP_GET_LEVEL(bp) > 0)
arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
- if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
- arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -6241,7 +5829,6 @@ top:
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
hdr->b_l1hdr.b_acb = acb;
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
if (HDR_HAS_L2HDR(hdr) &&
(vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
@@ -6282,7 +5869,7 @@ top:
blkptr_t *, bp, uint64_t, lsize,
zbookmark_phys_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
metadata, misses);
zfs_racct_read(size, 1);
@@ -6300,11 +5887,9 @@ top:
* 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev.
- * 5. This isn't prefetch or l2arc_noprefetch is 0.
*/
if (HDR_HAS_L2HDR(hdr) &&
- !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
- !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
l2arc_read_callback_t *cb;
abd_t *abd;
uint64_t asize;
@@ -6356,8 +5941,7 @@ top:
asize, abd,
ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
- zio_flags | ZIO_FLAG_DONT_CACHE |
- ZIO_FLAG_CANFAIL |
+ zio_flags | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
acb->acb_zio_head = rzio;
@@ -6436,6 +6020,16 @@ out:
spa_read_history_add(spa, zb, *arc_flags);
spl_fstrans_unmark(cookie);
return (rc);
+
+done:
+ if (done)
+ done(NULL, zb, bp, buf, private);
+ if (pio && rc != 0) {
+ zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags);
+ zio->io_error = rc;
+ zio_nowait(zio);
+ }
+ goto out;
}
arc_prune_t *
@@ -6476,6 +6070,56 @@ arc_remove_prune_callback(arc_prune_t *p)
}
/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+ arc_prune_t *ap = (arc_prune_t *)ptr;
+ arc_prune_func_t *func = ap->p_pfunc;
+
+ if (func != NULL)
+ func(ap->p_adjust, ap->p_private);
+
+ (void) zfs_refcount_remove(&ap->p_refcnt, func);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the metadata limit and reclaim otherwise pinned ARC buffers.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+static void
+arc_prune_async(uint64_t adjust)
+{
+ arc_prune_t *ap;
+
+ mutex_enter(&arc_prune_mtx);
+ for (ap = list_head(&arc_prune_list); ap != NULL;
+ ap = list_next(&arc_prune_list, ap)) {
+
+ if (zfs_refcount_count(&ap->p_refcnt) >= 2)
+ continue;
+
+ zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
+ ap->p_adjust = adjust;
+ if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
+ ap, TQ_SLEEP) == TASKQID_INVALID) {
+ (void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
+ continue;
+ }
+ ARCSTAT_BUMP(arcstat_prune);
+ }
+ mutex_exit(&arc_prune_mtx);
+}
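arc_prune_async() above skips any registration whose refcount is already 2, i.e. whose previous prune task has not yet finished, and takes a reference for each task it dispatches. A minimal userspace sketch of that guard, not part of the patch, with the taskq replaced by a direct call and all types and values invented for illustration:

#include <stdio.h>

typedef void prune_func_t(unsigned long adjust, void *arg);

typedef struct prune_reg {
	prune_func_t *func;
	void *arg;
	int refcnt;		/* 1 = registered, 2 = task in flight */
} prune_reg_t;

static void
my_prune(unsigned long adjust, void *arg)
{
	printf("%s: drop up to %lu holds\n", (const char *)arg, adjust);
}

int
main(void)
{
	prune_reg_t regs[2] = {
		{ my_prune, "consumer A", 1 },
		{ my_prune, "consumer B", 2 },	/* already in flight */
	};

	for (int i = 0; i < 2; i++) {
		if (regs[i].refcnt >= 2)
			continue;		/* previous prune still running */
		regs[i].refcnt++;		/* reference for the "task" */
		regs[i].func(128, regs[i].arg);	/* run the prune callback */
		regs[i].refcnt--;		/* task done, drop reference */
	}
	return (0);
}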
+
+/*
* Notify the arc that a block was freed, and thus will never be used again.
*/
void
@@ -6493,10 +6137,8 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
/*
* We might be trying to free a block that is still doing I/O
- * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
- * dmu_sync-ed block). If this block is being prefetched, then it
- * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
- * until the I/O completes. A block may also have a reference if it is
+ * (i.e. prefetch) or has some other reference (i.e. a dedup-ed,
+ * dmu_sync-ed block). A block may also have a reference if it is
* part of a dedup-ed, dmu_synced write. The dmu_sync() function would
* have written the new block to its final resting place on disk but
* without the dedup flag set. This would have left the hdr in the MRU
@@ -6513,9 +6155,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
* freed. So if we have an I/O in progress, or a reference to
* this hdr, then we don't destroy the hdr.
*/
- if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
- zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
- arc_change_state(arc_anon, hdr, hash_lock);
+ if (!HDR_HAS_L1HDR(hdr) ||
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ arc_change_state(arc_anon, hdr);
arc_hdr_destroy(hdr);
mutex_exit(hash_lock);
} else {
@@ -6531,7 +6173,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
* a new hdr for the buffer.
*/
void
-arc_release(arc_buf_t *buf, void *tag)
+arc_release(arc_buf_t *buf, const void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -6541,8 +6183,6 @@ arc_release(arc_buf_t *buf, void *tag)
* But we don't know that information at this level.
*/
- mutex_enter(&buf->b_evict_lock);
-
ASSERT(HDR_HAS_L1HDR(hdr));
/*
@@ -6551,14 +6191,14 @@ arc_release(arc_buf_t *buf, void *tag)
* linked into the hash table.
*/
if (hdr->b_l1hdr.b_state == arc_anon) {
- mutex_exit(&buf->b_evict_lock);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(!HDR_IN_HASH_TABLE(hdr));
ASSERT(!HDR_HAS_L2HDR(hdr));
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
+ ASSERT(ARC_BUF_LAST(buf));
ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
hdr->b_l1hdr.b_arc_access = 0;
@@ -6607,7 +6247,7 @@ arc_release(arc_buf_t *buf, void *tag)
/*
* Do we have more than one buf?
*/
- if (hdr->b_l1hdr.b_bufcnt > 1) {
+ if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) {
arc_buf_hdr_t *nhdr;
uint64_t spa = hdr->b_spa;
uint64_t psize = HDR_GET_PSIZE(hdr);
@@ -6618,9 +6258,9 @@ arc_release(arc_buf_t *buf, void *tag)
VERIFY3U(hdr->b_type, ==, type);
ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
- (void) remove_reference(hdr, hash_lock, tag);
+ VERIFY3S(remove_reference(hdr, tag), >, 0);
- if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
ASSERT(ARC_BUF_LAST(buf));
}
@@ -6637,9 +6277,9 @@ arc_release(arc_buf_t *buf, void *tag)
* If the current arc_buf_t and the hdr are sharing their data
* buffer, then we must stop sharing that block.
*/
- if (arc_buf_is_shared(buf)) {
+ if (ARC_BUF_SHARED(buf)) {
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
- VERIFY(!arc_buf_is_shared(lastbuf));
+ ASSERT(!arc_buf_is_shared(lastbuf));
/*
* First, sever the block sharing relationship between
@@ -6656,7 +6296,7 @@ arc_release(arc_buf_t *buf, void *tag)
if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf);
} else {
- arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ arc_hdr_alloc_abd(hdr, 0);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
buf->b_data, psize);
}
@@ -6672,13 +6312,13 @@ arc_release(arc_buf_t *buf, void *tag)
*/
ASSERT(arc_buf_is_shared(lastbuf) ||
arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
- ASSERT(!ARC_BUF_SHARED(buf));
+ ASSERT(!arc_buf_is_shared(buf));
}
ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
ASSERT3P(state, !=, arc_l2c_only);
- (void) zfs_refcount_remove_many(&state->arcs_size,
+ (void) zfs_refcount_remove_many(&state->arcs_size[type],
arc_buf_size(buf), buf);
if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
@@ -6688,10 +6328,6 @@ arc_release(arc_buf_t *buf, void *tag)
arc_buf_size(buf), buf);
}
- hdr->b_l1hdr.b_bufcnt -= 1;
- if (ARC_BUF_ENCRYPTED(buf))
- hdr->b_crypt_hdr.b_ebufcnt -= 1;
-
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
@@ -6701,30 +6337,20 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(hash_lock);
- /*
- * Allocate a new hdr. The new hdr will contain a b_pabd
- * buffer which will be freed in arc_write().
- */
nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
compress, hdr->b_complevel, type);
ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(nhdr->b_l1hdr.b_bufcnt);
ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
VERIFY3U(nhdr->b_type, ==, type);
ASSERT(!HDR_SHARED_DATA(nhdr));
nhdr->b_l1hdr.b_buf = buf;
- nhdr->b_l1hdr.b_bufcnt = 1;
- if (ARC_BUF_ENCRYPTED(buf))
- nhdr->b_crypt_hdr.b_ebufcnt = 1;
(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
- mutex_exit(&buf->b_evict_lock);
- (void) zfs_refcount_add_many(&arc_anon->arcs_size,
+ (void) zfs_refcount_add_many(&arc_anon->arcs_size[type],
arc_buf_size(buf), buf);
} else {
- mutex_exit(&buf->b_evict_lock);
ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
/* protected by hash lock, or hdr is on arc_anon */
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
@@ -6733,7 +6359,7 @@ arc_release(arc_buf_t *buf, void *tag)
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
- arc_change_state(arc_anon, hdr, hash_lock);
+ arc_change_state(arc_anon, hdr);
hdr->b_l1hdr.b_arc_access = 0;
mutex_exit(hash_lock);
@@ -6745,25 +6371,15 @@ arc_release(arc_buf_t *buf, void *tag)
int
arc_released(arc_buf_t *buf)
{
- int released;
-
- mutex_enter(&buf->b_evict_lock);
- released = (buf->b_data != NULL &&
+ return (buf->b_data != NULL &&
buf->b_hdr->b_l1hdr.b_state == arc_anon);
- mutex_exit(&buf->b_evict_lock);
- return (released);
}
#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
- int referenced;
-
- mutex_enter(&buf->b_evict_lock);
- referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
- mutex_exit(&buf->b_evict_lock);
- return (referenced);
+ return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
}
#endif
@@ -6779,7 +6395,7 @@ arc_write_ready(zio_t *zio)
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
- ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
/*
* If we're reexecuting this zio because the pool suspended, then
@@ -6790,9 +6406,10 @@ arc_write_ready(zio_t *zio)
arc_cksum_free(hdr);
arc_buf_unwatch(buf);
if (hdr->b_l1hdr.b_pabd != NULL) {
- if (arc_buf_is_shared(buf)) {
+ if (ARC_BUF_SHARED(buf)) {
arc_unshare_buf(hdr, buf);
} else {
+ ASSERT(!arc_buf_is_shared(buf));
arc_hdr_free_abd(hdr, B_FALSE);
}
}
@@ -6807,18 +6424,16 @@ arc_write_ready(zio_t *zio)
callback->awcb_ready(zio, buf, callback->awcb_private);
- if (HDR_IO_IN_PROGRESS(hdr))
+ if (HDR_IO_IN_PROGRESS(hdr)) {
ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
-
- arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
-
- if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
- hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
+ }
if (BP_IS_PROTECTED(bp)) {
/* ZIL blocks are written through zio_rewrite */
ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
- ASSERT(HDR_PROTECTED(hdr));
if (BP_SHOULD_BYTESWAP(bp)) {
if (BP_GET_LEVEL(bp) > 0) {
@@ -6831,11 +6446,14 @@ arc_write_ready(zio_t *zio)
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
}
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
hdr->b_crypt_hdr.b_iv);
zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ } else {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
}
/*
@@ -6886,10 +6504,11 @@ arc_write_ready(zio_t *zio)
if (ARC_BUF_ENCRYPTED(buf)) {
ASSERT3U(psize, >, 0);
ASSERT(ARC_BUF_COMPRESSED(buf));
- arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA |
+ arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
ARC_HDR_USE_RESERVE);
abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
- } else if (!abd_size_alloc_linear(arc_buf_size(buf)) ||
+ } else if (!(HDR_UNCACHED(hdr) ||
+ abd_size_alloc_linear(arc_buf_size(buf))) ||
!arc_can_share(hdr, buf)) {
/*
* Ideally, we would always copy the io_abd into b_pabd, but the
@@ -6898,26 +6517,25 @@ arc_write_ready(zio_t *zio)
*/
if (BP_IS_ENCRYPTED(bp)) {
ASSERT3U(psize, >, 0);
- arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
- ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE);
+ arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
+ ARC_HDR_USE_RESERVE);
abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
!ARC_BUF_COMPRESSED(buf)) {
ASSERT3U(psize, >, 0);
- arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
- ARC_HDR_USE_RESERVE);
+ arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
} else {
ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
- arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT |
- ARC_HDR_USE_RESERVE);
+ arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
}
} else {
ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
+ ASSERT(ARC_BUF_LAST(buf));
arc_share_buf(hdr, buf);
}
@@ -6936,18 +6554,6 @@ arc_write_children_ready(zio_t *zio)
callback->awcb_children_ready(zio, buf, callback->awcb_private);
}
-/*
- * The SPA calls this callback for each physical write that happens on behalf
- * of a logical write. See the comment in dbuf_write_physdone() for details.
- */
-static void
-arc_write_physdone(zio_t *zio)
-{
- arc_write_callback_t *cb = zio->io_private;
- if (cb->awcb_physdone != NULL)
- cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
-}
-
static void
arc_write_done(zio_t *zio)
{
@@ -6964,7 +6570,7 @@ arc_write_done(zio_t *zio)
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
}
} else {
ASSERT(HDR_EMPTY(hdr));
@@ -6997,7 +6603,7 @@ arc_write_done(zio_t *zio)
(void *)hdr, (void *)exists);
ASSERT(zfs_refcount_is_zero(
&exists->b_l1hdr.b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
+ arc_change_state(arc_anon, exists);
arc_hdr_destroy(exists);
mutex_exit(hash_lock);
exists = buf_hash_insert(hdr, &hash_lock);
@@ -7010,22 +6616,24 @@ arc_write_done(zio_t *zio)
(void *)hdr, (void *)exists);
} else {
/* Dedup */
- ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
+ ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(BP_GET_DEDUP(zio->io_bp));
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
}
}
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ VERIFY3S(remove_reference(hdr, hdr), >, 0);
/* if it's not anon, we are doing a scrub */
if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
- arc_access(hdr, hash_lock);
+ arc_access(hdr, 0, B_FALSE);
mutex_exit(hash_lock);
} else {
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ VERIFY3S(remove_reference(hdr, hdr), >, 0);
}
- ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
abd_free(zio->io_abd);
@@ -7034,11 +6642,11 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
- blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
const zio_prop_t *zp, arc_write_done_func_t *ready,
- arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
- arc_write_done_func_t *done, void *private, zio_priority_t priority,
- int zio_flags, const zbookmark_phys_t *zb)
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
+ void *private, zio_priority_t priority, int zio_flags,
+ const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
@@ -7050,8 +6658,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
- ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
- if (l2arc)
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
+ if (uncached)
+ arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
+ else if (l2arc)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (ARC_BUF_ENCRYPTED(buf)) {
@@ -7062,11 +6672,11 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
localprop.zp_byteorder =
(hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
- bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
+ memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt,
ZIO_DATA_SALT_LEN);
- bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
+ memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv,
ZIO_DATA_IV_LEN);
- bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
+ memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac,
ZIO_DATA_MAC_LEN);
if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
localprop.zp_nopwrite = B_FALSE;
@@ -7083,7 +6693,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
callback->awcb_children_ready = children_ready;
- callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
@@ -7099,9 +6708,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
* The hdr will remain with a NULL data pointer and the
* buf will take sole ownership of the block.
*/
- if (arc_buf_is_shared(buf)) {
+ if (ARC_BUF_SHARED(buf)) {
arc_unshare_buf(hdr, buf);
} else {
+ ASSERT(!arc_buf_is_shared(buf));
arc_hdr_free_abd(hdr, B_FALSE);
}
VERIFY3P(buf->b_data, !=, NULL);
@@ -7120,8 +6730,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
- arc_write_physdone, arc_write_done, callback,
- priority, zio_flags, zb);
+ arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
@@ -7162,7 +6771,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
/* assert that it has not wrapped around */
ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
- anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
+ anon_size = MAX((int64_t)
+ (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) +
+ zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) -
arc_loaned_bytes), 0);
/*
@@ -7218,9 +6829,14 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+ kstat_named_t *data, kstat_named_t *metadata,
kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
- size->value.ui64 = zfs_refcount_count(&state->arcs_size);
+ data->value.ui64 =
+ zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]);
+ metadata->value.ui64 =
+ zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
+ size->value.ui64 = data->value.ui64 + metadata->value.ui64;
evict_data->value.ui64 =
zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
evict_metadata->value.ui64 =
@@ -7237,22 +6853,32 @@ arc_kstat_update(kstat_t *ksp, int rw)
as->arcstat_hits.value.ui64 =
wmsum_value(&arc_sums.arcstat_hits);
+ as->arcstat_iohits.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_iohits);
as->arcstat_misses.value.ui64 =
wmsum_value(&arc_sums.arcstat_misses);
as->arcstat_demand_data_hits.value.ui64 =
wmsum_value(&arc_sums.arcstat_demand_data_hits);
+ as->arcstat_demand_data_iohits.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_demand_data_iohits);
as->arcstat_demand_data_misses.value.ui64 =
wmsum_value(&arc_sums.arcstat_demand_data_misses);
as->arcstat_demand_metadata_hits.value.ui64 =
wmsum_value(&arc_sums.arcstat_demand_metadata_hits);
+ as->arcstat_demand_metadata_iohits.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_demand_metadata_iohits);
as->arcstat_demand_metadata_misses.value.ui64 =
wmsum_value(&arc_sums.arcstat_demand_metadata_misses);
as->arcstat_prefetch_data_hits.value.ui64 =
wmsum_value(&arc_sums.arcstat_prefetch_data_hits);
+ as->arcstat_prefetch_data_iohits.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_prefetch_data_iohits);
as->arcstat_prefetch_data_misses.value.ui64 =
wmsum_value(&arc_sums.arcstat_prefetch_data_misses);
as->arcstat_prefetch_metadata_hits.value.ui64 =
wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits);
+ as->arcstat_prefetch_metadata_iohits.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits);
as->arcstat_prefetch_metadata_misses.value.ui64 =
wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses);
as->arcstat_mru_hits.value.ui64 =
@@ -7263,6 +6889,8 @@ arc_kstat_update(kstat_t *ksp, int rw)
wmsum_value(&arc_sums.arcstat_mfu_hits);
as->arcstat_mfu_ghost_hits.value.ui64 =
wmsum_value(&arc_sums.arcstat_mfu_ghost_hits);
+ as->arcstat_uncached_hits.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_uncached_hits);
as->arcstat_deleted.value.ui64 =
wmsum_value(&arc_sums.arcstat_deleted);
as->arcstat_mutex_miss.value.ui64 =
@@ -7308,33 +6936,49 @@ arc_kstat_update(kstat_t *ksp, int rw)
#if defined(COMPAT_FREEBSD11)
as->arcstat_other_size.value.ui64 =
wmsum_value(&arc_sums.arcstat_bonus_size) +
- aggsum_value(&arc_sums.arcstat_dnode_size) +
+ wmsum_value(&arc_sums.arcstat_dnode_size) +
wmsum_value(&arc_sums.arcstat_dbuf_size);
#endif
arc_kstat_update_state(arc_anon,
&as->arcstat_anon_size,
+ &as->arcstat_anon_data,
+ &as->arcstat_anon_metadata,
&as->arcstat_anon_evictable_data,
&as->arcstat_anon_evictable_metadata);
arc_kstat_update_state(arc_mru,
&as->arcstat_mru_size,
+ &as->arcstat_mru_data,
+ &as->arcstat_mru_metadata,
&as->arcstat_mru_evictable_data,
&as->arcstat_mru_evictable_metadata);
arc_kstat_update_state(arc_mru_ghost,
&as->arcstat_mru_ghost_size,
+ &as->arcstat_mru_ghost_data,
+ &as->arcstat_mru_ghost_metadata,
&as->arcstat_mru_ghost_evictable_data,
&as->arcstat_mru_ghost_evictable_metadata);
arc_kstat_update_state(arc_mfu,
&as->arcstat_mfu_size,
+ &as->arcstat_mfu_data,
+ &as->arcstat_mfu_metadata,
&as->arcstat_mfu_evictable_data,
&as->arcstat_mfu_evictable_metadata);
arc_kstat_update_state(arc_mfu_ghost,
&as->arcstat_mfu_ghost_size,
+ &as->arcstat_mfu_ghost_data,
+ &as->arcstat_mfu_ghost_metadata,
&as->arcstat_mfu_ghost_evictable_data,
&as->arcstat_mfu_ghost_evictable_metadata);
+ arc_kstat_update_state(arc_uncached,
+ &as->arcstat_uncached_size,
+ &as->arcstat_uncached_data,
+ &as->arcstat_uncached_metadata,
+ &as->arcstat_uncached_evictable_data,
+ &as->arcstat_uncached_evictable_metadata);
as->arcstat_dnode_size.value.ui64 =
- aggsum_value(&arc_sums.arcstat_dnode_size);
+ wmsum_value(&arc_sums.arcstat_dnode_size);
as->arcstat_bonus_size.value.ui64 =
wmsum_value(&arc_sums.arcstat_bonus_size);
as->arcstat_l2_hits.value.ui64 =
@@ -7432,13 +7076,21 @@ arc_kstat_update(kstat_t *ksp, int rw)
as->arcstat_prune.value.ui64 =
wmsum_value(&arc_sums.arcstat_prune);
as->arcstat_meta_used.value.ui64 =
- aggsum_value(&arc_sums.arcstat_meta_used);
+ wmsum_value(&arc_sums.arcstat_meta_used);
as->arcstat_async_upgrade_sync.value.ui64 =
wmsum_value(&arc_sums.arcstat_async_upgrade_sync);
+ as->arcstat_predictive_prefetch.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_predictive_prefetch);
as->arcstat_demand_hit_predictive_prefetch.value.ui64 =
wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch);
+ as->arcstat_demand_iohit_predictive_prefetch.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
+ as->arcstat_prescient_prefetch.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_prescient_prefetch);
as->arcstat_demand_hit_prescient_prefetch.value.ui64 =
wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch);
+ as->arcstat_demand_iohit_prescient_prefetch.value.ui64 =
+ wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
as->arcstat_raw_size.value.ui64 =
wmsum_value(&arc_sums.arcstat_raw_size);
as->arcstat_cached_only_in_progress.value.ui64 =
@@ -7510,7 +7162,6 @@ void
arc_tuning_update(boolean_t verbose)
{
uint64_t allmem = arc_all_memory();
- unsigned long limit;
/* Valid range: 32M - <arc_c_max> */
if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
@@ -7527,44 +7178,15 @@ arc_tuning_update(boolean_t verbose)
(zfs_arc_max > arc_c_min)) {
arc_c_max = zfs_arc_max;
arc_c = MIN(arc_c, arc_c_max);
- arc_p = (arc_c >> 1);
- if (arc_meta_limit > arc_c_max)
- arc_meta_limit = arc_c_max;
- if (arc_dnode_size_limit > arc_meta_limit)
- arc_dnode_size_limit = arc_meta_limit;
+ if (arc_dnode_limit > arc_c_max)
+ arc_dnode_limit = arc_c_max;
}
WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
- /* Valid range: 16M - <arc_c_max> */
- if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
- (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
- (zfs_arc_meta_min <= arc_c_max)) {
- arc_meta_min = zfs_arc_meta_min;
- if (arc_meta_limit < arc_meta_min)
- arc_meta_limit = arc_meta_min;
- if (arc_dnode_size_limit < arc_meta_min)
- arc_dnode_size_limit = arc_meta_min;
- }
- WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose);
-
- /* Valid range: <arc_meta_min> - <arc_c_max> */
- limit = zfs_arc_meta_limit ? zfs_arc_meta_limit :
- MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100;
- if ((limit != arc_meta_limit) &&
- (limit >= arc_meta_min) &&
- (limit <= arc_c_max))
- arc_meta_limit = limit;
- WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose);
-
- /* Valid range: <arc_meta_min> - <arc_meta_limit> */
- limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
- MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100;
- if ((limit != arc_dnode_size_limit) &&
- (limit >= arc_meta_min) &&
- (limit <= arc_meta_limit))
- arc_dnode_size_limit = limit;
- WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit,
- verbose);
+ /* Valid range: 0 - <all physical memory> */
+ arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
+ MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100;
+ WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose);
/* Valid range: 1 - N */
if (zfs_arc_grow_retry)
@@ -7576,10 +7198,6 @@ arc_tuning_update(boolean_t verbose)
arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
}
- /* Valid range: 1 - N */
- if (zfs_arc_p_min_shift)
- arc_p_min_shift = zfs_arc_p_min_shift;
-
/* Valid range: 1 - N ms */
if (zfs_arc_min_prefetch_ms)
arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
@@ -7591,65 +7209,67 @@ arc_tuning_update(boolean_t verbose)
}
/* Valid range: 0 - 100 */
- if ((zfs_arc_lotsfree_percent >= 0) &&
- (zfs_arc_lotsfree_percent <= 100))
+ if (zfs_arc_lotsfree_percent <= 100)
arc_lotsfree_percent = zfs_arc_lotsfree_percent;
WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
verbose);
/* Valid range: 0 - <all physical memory> */
if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
- arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
+ arc_sys_free = MIN(zfs_arc_sys_free, allmem);
WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
}
static void
+arc_state_multilist_init(multilist_t *ml,
+ multilist_sublist_index_func_t *index_func, int *maxcountp)
+{
+ multilist_create(ml, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func);
+ *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml));
+}
+
+static void
arc_state_init(void)
{
- multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
- multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_multilist_index_func);
+ int num_sublists = 0;
+
+ arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA],
+ arc_state_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA],
+ arc_state_multilist_index_func, &num_sublists);
+
/*
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated. Special index function asserts that.
*/
- multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_l2c_multilist_index_func);
- multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
- arc_state_l2c_multilist_index_func);
+ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ arc_state_l2c_multilist_index_func, &num_sublists);
+ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ arc_state_l2c_multilist_index_func, &num_sublists);
+
+ /*
+ * Keep track of the number of markers needed to reclaim buffers from
+ * any ARC state. The markers will be pre-allocated so as to minimize
+ * the number of memory allocations performed by the eviction thread.
+ */
+ arc_state_evict_marker_count = num_sublists;
zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
@@ -7663,28 +7283,49 @@ arc_state_init(void)
zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
-
- zfs_refcount_create(&arc_anon->arcs_size);
- zfs_refcount_create(&arc_mru->arcs_size);
- zfs_refcount_create(&arc_mru_ghost->arcs_size);
- zfs_refcount_create(&arc_mfu->arcs_size);
- zfs_refcount_create(&arc_mfu_ghost->arcs_size);
- zfs_refcount_create(&arc_l2c_only->arcs_size);
+ zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
+
+ wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0);
+ wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
+ wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0);
+ wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
wmsum_init(&arc_sums.arcstat_hits, 0);
+ wmsum_init(&arc_sums.arcstat_iohits, 0);
wmsum_init(&arc_sums.arcstat_misses, 0);
wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
+ wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0);
wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
+ wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0);
wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
+ wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0);
wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
+ wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0);
wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
wmsum_init(&arc_sums.arcstat_mru_hits, 0);
wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
+ wmsum_init(&arc_sums.arcstat_uncached_hits, 0);
wmsum_init(&arc_sums.arcstat_deleted, 0);
wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
wmsum_init(&arc_sums.arcstat_access_skip, 0);
@@ -7706,7 +7347,7 @@ arc_state_init(void)
wmsum_init(&arc_sums.arcstat_data_size, 0);
wmsum_init(&arc_sums.arcstat_metadata_size, 0);
wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
- aggsum_init(&arc_sums.arcstat_dnode_size, 0);
+ wmsum_init(&arc_sums.arcstat_dnode_size, 0);
wmsum_init(&arc_sums.arcstat_bonus_size, 0);
wmsum_init(&arc_sums.arcstat_l2_hits, 0);
wmsum_init(&arc_sums.arcstat_l2_misses, 0);
@@ -7751,10 +7392,14 @@ arc_state_init(void)
wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
wmsum_init(&arc_sums.arcstat_prune, 0);
- aggsum_init(&arc_sums.arcstat_meta_used, 0);
+ wmsum_init(&arc_sums.arcstat_meta_used, 0);
wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
+ wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0);
wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
+ wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0);
+ wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0);
wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
+ wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0);
wmsum_init(&arc_sums.arcstat_raw_size, 0);
wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);
@@ -7765,6 +7410,7 @@ arc_state_init(void)
arc_mfu->arcs_state = ARC_STATE_MFU;
arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
+ arc_uncached->arcs_state = ARC_STATE_UNCACHED;
}
static void
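
The aggsum-to-wmsum conversions above (arcstat_dnode_size, arcstat_meta_used) move those counters to the cheaper write-mostly interface, since their readers only need an approximate total at kstat time. A minimal lifecycle sketch follows, assuming the stock OpenZFS <sys/wmsum.h> API (wmsum_add() is not shown in this hunk but belongs to the same interface); the counter and function names here are illustrative, not part of the patch.

#include <sys/wmsum.h>

static wmsum_t example_bytes;		/* illustrative counter */

static void
example_counter_init(void)
{
	wmsum_init(&example_bytes, 0);
}

static void
example_counter_track(int64_t delta)
{
	/* Per-CPU add on the hot path; no global atomics. */
	wmsum_add(&example_bytes, delta);
}

static uint64_t
example_counter_read(void)
{
	/* Sums the per-CPU values only when a reader asks. */
	return (wmsum_value(&example_bytes));
}

static void
example_counter_fini(void)
{
	wmsum_fini(&example_bytes);
}
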
@@ -7782,13 +7428,23 @@ arc_state_fini(void)
zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
-
- zfs_refcount_destroy(&arc_anon->arcs_size);
- zfs_refcount_destroy(&arc_mru->arcs_size);
- zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
- zfs_refcount_destroy(&arc_mfu->arcs_size);
- zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
- zfs_refcount_destroy(&arc_l2c_only->arcs_size);
+ zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
@@ -7800,21 +7456,34 @@ arc_state_fini(void)
multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]);
+
+ wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
+ wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
+ wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
+ wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
wmsum_fini(&arc_sums.arcstat_hits);
+ wmsum_fini(&arc_sums.arcstat_iohits);
wmsum_fini(&arc_sums.arcstat_misses);
wmsum_fini(&arc_sums.arcstat_demand_data_hits);
+ wmsum_fini(&arc_sums.arcstat_demand_data_iohits);
wmsum_fini(&arc_sums.arcstat_demand_data_misses);
wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
+ wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits);
wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
+ wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits);
wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
+ wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits);
wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
wmsum_fini(&arc_sums.arcstat_mru_hits);
wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
wmsum_fini(&arc_sums.arcstat_mfu_hits);
wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
+ wmsum_fini(&arc_sums.arcstat_uncached_hits);
wmsum_fini(&arc_sums.arcstat_deleted);
wmsum_fini(&arc_sums.arcstat_mutex_miss);
wmsum_fini(&arc_sums.arcstat_access_skip);
@@ -7836,7 +7505,7 @@ arc_state_fini(void)
wmsum_fini(&arc_sums.arcstat_data_size);
wmsum_fini(&arc_sums.arcstat_metadata_size);
wmsum_fini(&arc_sums.arcstat_dbuf_size);
- aggsum_fini(&arc_sums.arcstat_dnode_size);
+ wmsum_fini(&arc_sums.arcstat_dnode_size);
wmsum_fini(&arc_sums.arcstat_bonus_size);
wmsum_fini(&arc_sums.arcstat_l2_hits);
wmsum_fini(&arc_sums.arcstat_l2_misses);
@@ -7881,10 +7550,14 @@ arc_state_fini(void)
wmsum_fini(&arc_sums.arcstat_memory_direct_count);
wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
wmsum_fini(&arc_sums.arcstat_prune);
- aggsum_fini(&arc_sums.arcstat_meta_used);
+ wmsum_fini(&arc_sums.arcstat_meta_used);
wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
+ wmsum_fini(&arc_sums.arcstat_predictive_prefetch);
wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
+ wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
+ wmsum_fini(&arc_sums.arcstat_prescient_prefetch);
wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
+ wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
wmsum_fini(&arc_sums.arcstat_raw_size);
wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
@@ -7949,18 +7622,16 @@ arc_init(void)
#endif
arc_c = arc_c_min;
- arc_p = (arc_c >> 1);
-
- /* Set min to 1/2 of arc_c_min */
- arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
/*
- * Set arc_meta_limit to a percent of arc_c_max with a floor of
- * arc_meta_min, and a ceiling of arc_c_max.
+ * 32-bit fixed point fractions of metadata from total ARC size,
+ * MRU data from all data and MRU metadata from all metadata.
*/
- percent = MIN(zfs_arc_meta_limit_percent, 100);
- arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
+ arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */
+ arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */
+ arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */
+
percent = MIN(zfs_arc_dnode_limit_percent, 100);
- arc_dnode_size_limit = (percent * arc_meta_limit) / 100;
+ arc_dnode_limit = arc_c_max * percent / 100;
/* Apply user specified tunings */
arc_tuning_update(B_TRUE);
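
The arc_meta/arc_pd/arc_pm values above are 32-bit fixed-point fractions rather than byte limits. Below is a standalone arithmetic sketch of how such a fraction turns into a byte target; the ARC size and the program itself are example material only, not code from the patch.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t arc_c = 8ULL << 30;		/* example: 8 GiB ARC */
	uint64_t frac = (1ULL << 32) / 4;	/* 25%, as in arc_meta */

	/* Multiply, then shift right by 32 to apply the fraction. */
	uint64_t target = (arc_c * frac) >> 32;

	/* Prints 2147483648 (2 GiB): 25% of the 8 GiB example ARC. */
	printf("metadata target: %ju bytes\n", (uintmax_t)target);
	return (0);
}
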
@@ -7981,9 +7652,8 @@ arc_init(void)
offsetof(arc_prune_t, p_node));
mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
- arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
- boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
- TASKQ_THREADS_CPU_PCT);
+ arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
+ defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -7994,8 +7664,10 @@ arc_init(void)
kstat_install(arc_ksp);
}
- arc_evict_zthr = zthr_create("arc_evict",
- arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri);
+ arc_state_evict_markers =
+ arc_state_alloc_markers(arc_state_evict_marker_count);
+ arc_evict_zthr = zthr_create_timer("arc_evict",
+ arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri);
arc_reap_zthr = zthr_create_timer("arc_reap",
arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
@@ -8060,9 +7732,8 @@ arc_fini(void)
taskq_destroy(arc_prune_taskq);
mutex_enter(&arc_prune_mtx);
- while ((p = list_head(&arc_prune_list)) != NULL) {
- list_remove(&arc_prune_list, p);
- zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
+ while ((p = list_remove_head(&arc_prune_list)) != NULL) {
+ (void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
zfs_refcount_destroy(&p->p_refcnt);
kmem_free(p, sizeof (*p));
}
@@ -8073,6 +7744,8 @@ arc_fini(void)
(void) zthr_cancel(arc_evict_zthr);
(void) zthr_cancel(arc_reap_zthr);
+ arc_state_free_markers(arc_state_evict_markers,
+ arc_state_evict_marker_count);
mutex_destroy(&arc_evict_lock);
list_destroy(&arc_evict_waiters);
@@ -8367,7 +8040,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size, dev_size, tsize;
+ uint64_t size;
/*
* Make sure our globals have meaningful values in case the user
@@ -8375,38 +8048,33 @@ l2arc_write_size(l2arc_dev_t *dev)
*/
size = l2arc_write_max;
if (size == 0) {
- cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
- "be greater than zero, resetting it to the default (%d)",
- L2ARC_WRITE_SIZE);
+ cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
+ "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
size = l2arc_write_max = L2ARC_WRITE_SIZE;
}
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
+ /* We need to add in the worst case scenario of log block overhead. */
+ size += l2arc_log_blk_overhead(size, dev);
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
+ /*
+ * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+ * times the write size, whichever is greater.
+ */
+ size += MAX(64 * 1024 * 1024,
+ (size * l2arc_trim_ahead) / 100);
+ }
+
/*
* Make sure the write size does not exceed the size of the cache
* device. This is important in l2arc_evict(), otherwise infinite
* iteration can occur.
*/
- dev_size = dev->l2ad_end - dev->l2ad_start;
- tsize = size + l2arc_log_blk_overhead(size, dev);
- if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
- tsize += MAX(64 * 1024 * 1024,
- (tsize * l2arc_trim_ahead) / 100);
+ size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4);
- if (tsize >= dev_size) {
- cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
- "plus the overhead of log blocks (persistent L2ARC, "
- "%llu bytes) exceeds the size of the cache device "
- "(guid %llu), resetting them to the default (%d)",
- (u_longlong_t)l2arc_log_blk_overhead(size, dev),
- (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
- size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
-
- if (arc_warm == B_FALSE)
- size += l2arc_write_boost;
- }
+ size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift);
return (size);
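
The rewritten l2arc_write_size() above now folds the log-block overhead and the TRIM-ahead allowance into the write size itself, clamps the result to a quarter of the cache device, and rounds up to the device ashift. Here is a standalone sketch of that arithmetic with made-up inputs (the 64 MB write max, 200% trim-ahead, 4 GB device and 4 KB ashift are examples; ROUNDUP_P2 stands in for the kernel's P2ROUNDUP, and the arc_warm write-boost branch is omitted):

#include <stdint.h>
#include <stdio.h>

#define	ROUNDUP_P2(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define	MAX(a, b)		((a) > (b) ? (a) : (b))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t size = 64ULL << 20;		/* l2arc_write_max */
	uint64_t log_overhead = 1ULL << 20;	/* stand-in for l2arc_log_blk_overhead() */
	uint64_t trim_ahead = 200;		/* l2arc_trim_ahead, percent */
	uint64_t dev_capacity = 4ULL << 30;	/* l2ad_end - l2ad_start */
	uint64_t ashift = 12;

	size += log_overhead;
	/* Trim ahead of the write size: 64 MB or trim_ahead% of it. */
	size += MAX(64ULL << 20, (size * trim_ahead) / 100);
	/* Never target more than a quarter of the device per pass. */
	size = MIN(size, dev_capacity / 4);
	size = ROUNDUP_P2(size, 1ULL << ashift);

	/* Prints 204472320 (195 MB) for these example inputs. */
	printf("write size: %ju bytes\n", (uintmax_t)size);
	return (0);
}
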
@@ -8473,12 +8141,13 @@ l2arc_dev_get_next(void)
else if (next == first)
break;
+ ASSERT3P(next, !=, NULL);
} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
- next->l2ad_trim_all);
+ next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting);
/* if we were unable to find any usable vdevs, return NULL */
if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
- next->l2ad_trim_all)
+ next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting)
next = NULL;
l2arc_dev_last = next;
@@ -8503,20 +8172,14 @@ out:
static void
l2arc_do_free_on_write(void)
{
- list_t *buflist;
- l2arc_data_free_t *df, *df_prev;
+ l2arc_data_free_t *df;
mutex_enter(&l2arc_free_on_write_mtx);
- buflist = l2arc_free_on_write;
-
- for (df = list_tail(buflist); df; df = df_prev) {
- df_prev = list_prev(buflist, df);
+ while ((df = list_remove_head(l2arc_free_on_write)) != NULL) {
ASSERT3P(df->l2df_abd, !=, NULL);
abd_free(df->l2df_abd);
- list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
-
mutex_exit(&l2arc_free_on_write_mtx);
}
@@ -8651,7 +8314,8 @@ top:
ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
lb_ptr_buf);
- zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ (void) zfs_refcount_remove(&dev->l2ad_lb_count,
+ lb_ptr_buf);
kmem_free(lb_ptr_buf->lb_ptr,
sizeof (l2arc_log_blkptr_t));
kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
@@ -8676,14 +8340,15 @@ top:
* block pointer in the header.
*/
if (i == 0) {
- bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ memset(l2dhdr, 0,
+ dev->l2ad_dev_hdr_asize);
} else {
- bzero(&l2dhdr->dh_start_lbps[i],
+ memset(&l2dhdr->dh_start_lbps[i], 0,
sizeof (l2arc_log_blkptr_t));
}
break;
}
- bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
+ memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr,
sizeof (l2arc_log_blkptr_t));
lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
lb_ptr_buf);
@@ -8732,7 +8397,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
*/
if (BP_IS_ENCRYPTED(bp)) {
abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
- ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE);
+ ARC_HDR_USE_RESERVE);
zio_crypt_decode_params_bp(bp, salt, iv);
zio_crypt_decode_mac_bp(bp, mac);
@@ -8769,7 +8434,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!HDR_COMPRESSION_ENABLED(hdr)) {
abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
- ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE);
+ ARC_HDR_USE_RESERVE);
void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
@@ -8981,7 +8646,7 @@ l2arc_sublist_lock(int list_num)
* sublists being selected.
*/
idx = multilist_get_random_index(ml);
- return (multilist_sublist_lock(ml, idx));
+ return (multilist_sublist_lock_idx(ml, idx));
}
/*
@@ -9026,22 +8691,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
buflist = &dev->l2ad_buflist;
- /*
- * We need to add in the worst case scenario of log block overhead.
- */
- distance += l2arc_log_blk_overhead(distance, dev);
- if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
- /*
- * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
- * times the write size, whichever is greater.
- */
- distance += MAX(64 * 1024 * 1024,
- (distance * l2arc_trim_ahead) / 100);
- }
-
top:
rerun = B_FALSE;
- if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
+ if (dev->l2ad_hand + distance > dev->l2ad_end) {
/*
* When there is no space to accommodate upcoming writes,
* evict to the end. Then bump the write and evict hands
@@ -9134,7 +8786,8 @@ retry:
ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
lb_ptr_buf);
- zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ (void) zfs_refcount_remove(&dev->l2ad_lb_count,
+ lb_ptr_buf);
list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
kmem_free(lb_ptr_buf->lb_ptr,
sizeof (l2arc_log_blkptr_t));
@@ -9190,7 +8843,7 @@ retry:
* arc_hdr_destroy() will call list_remove()
* and decrement arcstat_l2_lsize.
*/
- arc_change_state(arc_anon, hdr, hash_lock);
+ arc_change_state(arc_anon, hdr);
arc_hdr_destroy(hdr);
} else {
ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
@@ -9233,9 +8886,9 @@ out:
* assertions may be violated without functional consequences
* as the device is about to be removed.
*/
- ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+ ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end);
if (!dev->l2ad_first)
- ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
+ ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
}
}
@@ -9249,7 +8902,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
abd_t **abd_out)
{
int ret;
- void *tmp = NULL;
abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
enum zio_compress compress = HDR_GET_COMPRESS(hdr);
uint64_t psize = HDR_GET_PSIZE(hdr);
@@ -9270,12 +8922,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
* and copy the data. This may be done to eliminate a dependency on a
* shared buffer or to reallocate the buffer to match asize.
*/
- if (HDR_HAS_RABD(hdr) && asize != psize) {
- ASSERT3U(asize, >=, psize);
+ if (HDR_HAS_RABD(hdr)) {
+ ASSERT3U(asize, >, psize);
to_write = abd_alloc_for_io(asize, ismd);
abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
- if (psize != asize)
- abd_zero_off(to_write, psize, asize - psize);
+ abd_zero_off(to_write, psize, asize - psize);
goto out;
}
@@ -9284,36 +8935,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
ASSERT3U(size, ==, psize);
to_write = abd_alloc_for_io(asize, ismd);
abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
- if (size != asize)
+ if (asize > size)
abd_zero_off(to_write, size, asize - size);
goto out;
}
if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
- cabd = abd_alloc_for_io(asize, ismd);
- tmp = abd_borrow_buf(cabd, asize);
-
- psize = zio_compress_data(compress, to_write, tmp, size,
- hdr->b_complevel);
-
- if (psize >= size) {
- abd_return_buf(cabd, tmp, asize);
- HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
- to_write = cabd;
- abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
- if (size != asize)
- abd_zero_off(to_write, size, asize - size);
- goto encrypt;
+ size_t bufsize = MAX(size, asize);
+ void *buf = zio_buf_alloc(bufsize);
+ uint64_t csize = zio_compress_data(compress, to_write, &buf,
+ size, hdr->b_complevel);
+ if (csize > psize) {
+ /*
+ * We can't re-compress the block into the original
+ * psize. Even if it fits into asize, it does not
+ * matter, since checksum will never match on read.
+ */
+ zio_buf_free(buf, bufsize);
+ return (SET_ERROR(EIO));
}
- ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
- if (psize < asize)
- bzero((char *)tmp + psize, asize - psize);
- psize = HDR_GET_PSIZE(hdr);
- abd_return_buf_copy(cabd, tmp, asize);
- to_write = cabd;
+ if (asize > csize)
+ memset((char *)buf + csize, 0, asize - csize);
+ to_write = cabd = abd_get_from_buf(buf, bufsize);
+ abd_take_ownership_of_buf(cabd, B_TRUE);
}
-encrypt:
if (HDR_ENCRYPTED(hdr)) {
eabd = abd_alloc_for_io(asize, ismd);
@@ -9342,7 +8988,7 @@ encrypt:
abd_zero_off(eabd, psize, asize - psize);
/* assert that the MAC we got here matches the one we saved */
- ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
+ ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
spa_keystore_dsl_key_rele(spa, dck, FTAG);
if (to_write == cabd)
@@ -9394,9 +9040,9 @@ l2arc_blk_fetch_done(zio_t *zio)
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
- arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_psize, write_lsize, headroom;
- boolean_t full;
+ arc_buf_hdr_t *hdr, *head, *marker;
+ uint64_t write_asize, write_psize, headroom;
+ boolean_t full, from_head = !arc_warm;
l2arc_write_callback_t *cb = NULL;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
@@ -9405,10 +9051,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT3P(dev->l2ad_vdev, !=, NULL);
pio = NULL;
- write_lsize = write_asize = write_psize = 0;
+ write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+ marker = arc_state_alloc_marker();
/*
* Copy buffers for L2ARC writing.
@@ -9423,40 +9070,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
continue;
}
- multilist_sublist_t *mls = l2arc_sublist_lock(pass);
uint64_t passed_sz = 0;
-
- VERIFY3P(mls, !=, NULL);
+ headroom = target_sz * l2arc_headroom;
+ if (zfs_compressed_arc_enabled)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
/*
- * L2ARC fast warmup.
- *
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
- if (arc_warm == B_FALSE)
+ multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+ ASSERT3P(mls, !=, NULL);
+ if (from_head)
hdr = multilist_sublist_head(mls);
else
hdr = multilist_sublist_tail(mls);
- headroom = target_sz * l2arc_headroom;
- if (zfs_compressed_arc_enabled)
- headroom = (headroom * l2arc_headroom_boost) / 100;
-
- for (; hdr; hdr = hdr_prev) {
+ while (hdr != NULL) {
kmutex_t *hash_lock;
abd_t *to_write = NULL;
- if (arc_warm == B_FALSE)
- hdr_prev = multilist_sublist_next(mls, hdr);
- else
- hdr_prev = multilist_sublist_prev(mls, hdr);
-
hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
- /*
- * Skip this buffer rather than waiting.
- */
+skip:
+ /* Skip this buffer rather than waiting. */
+ if (from_head)
+ hdr = multilist_sublist_next(mls, hdr);
+ else
+ hdr = multilist_sublist_prev(mls, hdr);
continue;
}
@@ -9471,17 +9112,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
if (!l2arc_write_eligible(guid, hdr)) {
mutex_exit(hash_lock);
- continue;
+ goto skip;
}
- /*
- * We rely on the L1 portion of the header below, so
- * it's invalid for this header to have been evicted out
- * of the ghost cache, prior to being written out. The
- * ARC_FLAG_L2_WRITING bit ensures this won't happen.
- */
ASSERT(HDR_HAS_L1HDR(hdr));
-
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
ASSERT3U(arc_hdr_size(hdr), >, 0);
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
@@ -9490,25 +9124,31 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
psize);
- if ((write_asize + asize) > target_sz) {
+ /*
+ * If the allocated size of this buffer plus the max
+ * size for the pending log block exceeds the evicted
+ * target size, terminate writing buffers for this run.
+ */
+ if (write_asize + asize +
+ sizeof (l2arc_log_blk_phys_t) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
}
/*
- * We rely on the L1 portion of the header below, so
- * it's invalid for this header to have been evicted out
- * of the ghost cache, prior to being written out. The
- * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ * We should not sleep with sublist lock held or it
+ * may block ARC eviction. Insert a marker to save
+ * the position and drop the lock.
*/
- arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
- ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
- HDR_HAS_RABD(hdr));
- ASSERT3U(arc_hdr_size(hdr), >, 0);
+ if (from_head) {
+ multilist_sublist_insert_after(mls, hdr,
+ marker);
+ } else {
+ multilist_sublist_insert_before(mls, hdr,
+ marker);
+ }
+ multilist_sublist_unlock(mls);
/*
* If this header has b_rabd, we can use this since it
@@ -9539,32 +9179,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
&to_write);
if (ret != 0) {
arc_hdr_clear_flags(hdr,
- ARC_FLAG_L2_WRITING);
+ ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
- continue;
+ goto next;
}
l2arc_free_abd_on_write(to_write, asize, type);
}
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_hits = 0;
+ hdr->b_l2hdr.b_arcs_state =
+ hdr->b_l1hdr.b_state->arcs_state;
+ mutex_enter(&dev->l2ad_mtx);
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
- mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, head);
- mutex_exit(&dev->l2ad_mtx);
+ }
+ list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
+ arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+ ARC_FLAG_L2_WRITING);
+
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ l2arc_hdr_arcstats_increment(hdr);
+ boolean_t commit = l2arc_log_blk_insert(dev, hdr);
+ mutex_exit(hash_lock);
+
+ if (pio == NULL) {
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
- /*
- * Create a list to save allocated abd buffers
- * for l2arc_log_blk_commit().
- */
list_create(&cb->l2wcb_abd_list,
sizeof (l2arc_lb_abd_buf_t),
offsetof(l2arc_lb_abd_buf_t, node));
@@ -9572,48 +9225,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ZIO_FLAG_CANFAIL);
}
- hdr->b_l2hdr.b_dev = dev;
- hdr->b_l2hdr.b_hits = 0;
-
- hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- hdr->b_l2hdr.b_arcs_state =
- hdr->b_l1hdr.b_state->arcs_state;
- arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
- mutex_enter(&dev->l2ad_mtx);
- list_insert_head(&dev->l2ad_buflist, hdr);
- mutex_exit(&dev->l2ad_mtx);
-
- (void) zfs_refcount_add_many(&dev->l2ad_alloc,
- arc_hdr_size(hdr), hdr);
-
wzio = zio_write_phys(pio, dev->l2ad_vdev,
- hdr->b_l2hdr.b_daddr, asize, to_write,
+ dev->l2ad_hand, asize, to_write,
ZIO_CHECKSUM_OFF, NULL, hdr,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);
- write_lsize += HDR_GET_LSIZE(hdr);
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
zio_t *, wzio);
+ zio_nowait(wzio);
write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
- l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
- mutex_exit(hash_lock);
-
- /*
- * Append buf info to current log and commit if full.
- * arcstat_l2_{size,asize} kstats are updated
- * internally.
- */
- if (l2arc_log_blk_insert(dev, hdr))
- l2arc_log_blk_commit(dev, pio, cb);
+ if (commit) {
+ /* l2ad_hand will be adjusted inside. */
+ write_asize +=
+ l2arc_log_blk_commit(dev, pio, cb);
+ }
- zio_nowait(wzio);
+next:
+ multilist_sublist_lock(mls);
+ if (from_head)
+ hdr = multilist_sublist_next(mls, marker);
+ else
+ hdr = multilist_sublist_prev(mls, marker);
+ multilist_sublist_remove(mls, marker);
}
multilist_sublist_unlock(mls);
@@ -9622,9 +9261,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}
+ arc_state_free_marker(marker);
+
/* No buffers selected for writing? */
if (pio == NULL) {
- ASSERT0(write_lsize);
+ ASSERT0(write_psize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
@@ -9664,7 +9305,7 @@ l2arc_hdr_limit_reached(void)
{
int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
- return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) ||
+ return (arc_reclaim_needed() ||
(s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
}
@@ -9672,10 +9313,10 @@ l2arc_hdr_limit_reached(void)
* This thread feeds the L2ARC at regular intervals. This is the beating
* heart of the L2ARC.
*/
-/* ARGSUSED */
-static void
+static __attribute__((noreturn)) void
l2arc_feed_thread(void *unused)
{
+ (void) unused;
callb_cpr_t cpr;
l2arc_dev_t *dev;
spa_t *spa;
@@ -9863,7 +9504,7 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
if (l2arc_trim_ahead > 0) {
dev->l2ad_trim_all = B_TRUE;
} else {
- bzero(l2dhdr, l2dhdr_asize);
+ memset(l2dhdr, 0, l2dhdr_asize);
l2arc_dev_hdr_update(dev);
}
}
@@ -10111,7 +9752,7 @@ l2arc_spa_rebuild_start(spa_t *spa)
/*
* Main entry point for L2ARC rebuilding.
*/
-static void
+static __attribute__((noreturn)) void
l2arc_dev_rebuild_thread(void *arg)
{
l2arc_dev_t *dev = arg;
@@ -10184,7 +9825,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
goto out;
/* Prepare the rebuild process */
- bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
+ memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
/* Start the rebuild process */
for (;;) {
@@ -10230,7 +9871,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
KM_SLEEP);
- bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
+ memcpy(lb_ptr_buf->lb_ptr, &lbps[0],
sizeof (l2arc_log_blkptr_t));
mutex_enter(&dev->l2ad_mtx);
list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
@@ -10268,7 +9909,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
!dev->l2ad_first)
goto out;
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
for (;;) {
mutex_enter(&l2arc_rebuild_thr_lock);
if (dev->l2ad_rebuild_cancel) {
@@ -10328,7 +9969,7 @@ out:
*/
spa_history_log_internal(spa, "L2ARC rebuild", NULL,
"no valid log blocks");
- bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize);
l2arc_dev_hdr_update(dev);
} else if (err == ECANCELED) {
/*
@@ -10370,8 +10011,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_SPECULATIVE, B_FALSE));
abd_free(abd);
@@ -10554,7 +10194,7 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
* since we may allocate significant amount of memory here, let ARC
* grow its arc_c.
*/
- arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only);
+ arc_adapt(log_entries * HDR_L2ONLY_SIZE);
for (int i = log_entries - 1; i >= 0; i--) {
/*
@@ -10691,11 +10331,10 @@ l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
cb->l2rcb_abd = abd_get_from_buf(lb, asize);
pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY);
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
(void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
- ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
return (pio);
@@ -10761,7 +10400,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
* This function allocates some memory to temporarily hold the serialized
* buffer to be written. This is then released in l2arc_write_done.
*/
-static void
+static uint64_t
l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
{
l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
@@ -10769,12 +10408,11 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
uint64_t psize, asize;
zio_t *wzio;
l2arc_lb_abd_buf_t *abd_buf;
- uint8_t *tmpbuf;
+ uint8_t *tmpbuf = NULL;
l2arc_lb_ptr_buf_t *lb_ptr_buf;
VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
- tmpbuf = zio_buf_alloc(sizeof (*lb));
abd_buf = zio_buf_alloc(sizeof (*abd_buf));
abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
@@ -10793,7 +10431,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
/* try to compress the buffer */
psize = zio_compress_data(ZIO_COMPRESS_LZ4,
- abd_buf->abd, tmpbuf, sizeof (*lb), 0);
+ abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
/* a log block is never entirely zero */
ASSERT(psize != 0);
@@ -10819,13 +10457,13 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
ZIO_CHECKSUM_FLETCHER_4);
if (asize < sizeof (*lb)) {
/* compression succeeded */
- bzero(tmpbuf + psize, asize - psize);
+ memset(tmpbuf + psize, 0, asize - psize);
L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_LZ4);
} else {
/* compression failed */
- bcopy(lb, tmpbuf, sizeof (*lb));
+ memcpy(tmpbuf, lb, sizeof (*lb));
L2BLK_SET_COMPRESS(
(&l2dhdr->dh_start_lbps[0])->lbp_prop,
ZIO_COMPRESS_OFF);
@@ -10851,7 +10489,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
* Include the committed log block's pointer in the list of pointers
* to log blocks present in the L2ARC device.
*/
- bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
+ memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0],
sizeof (l2arc_log_blkptr_t));
mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
@@ -10873,6 +10511,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
dev->l2ad_log_ent_idx = 0;
dev->l2ad_log_blk_payload_asize = 0;
dev->l2ad_log_blk_payload_start = 0;
+
+ return (asize);
}
/*
@@ -10940,7 +10580,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
ASSERT(HDR_HAS_L2HDR(hdr));
le = &lb->lb_entries[index];
- bzero(le, sizeof (*le));
+ memset(le, 0, sizeof (*le));
le->le_dva = hdr->b_dva;
le->le_birth = hdr->b_birth;
le->le_daddr = hdr->b_l2hdr.b_daddr;
@@ -10953,7 +10593,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
- L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+ L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
HDR_GET_PSIZE(hdr));
@@ -11009,79 +10649,56 @@ EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
- param_get_long, ZMOD_RW, "Min arc size");
+ spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
- param_get_long, ZMOD_RW, "Max arc size");
-
-ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long,
- param_get_long, ZMOD_RW, "Metadata limit for arc size");
-
-ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent,
- param_set_arc_long, param_get_long, ZMOD_RW,
- "Percent of arc size for arc meta limit");
-
-ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long,
- param_get_long, ZMOD_RW, "Min arc metadata");
+ spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes");
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW,
- "Meta objects to scan for prune");
-
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW,
- "Limit number of restarts in arc_evict_meta");
-
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW,
- "Meta reclaim strategy");
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW,
+ "Balance between metadata and data on ghost hits.");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
- param_get_int, ZMOD_RW, "Seconds before growing arc size");
-
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW,
- "Disable arc_p adapt dampener");
+ param_get_uint, ZMOD_RW, "Seconds before growing ARC size");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
- param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)");
+ param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
- "Percent of pagecache to reclaim arc to");
+ "Percent of pagecache to reclaim ARC to");
-ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int,
- param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p");
-
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD,
"Target average block size");
ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
- "Disable compressed arc buffers");
+ "Disable compressed ARC buffers");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
- param_get_int, ZMOD_RW, "Min life of prefetch block in ms");
+ param_get_uint, ZMOD_RW, "Min life of prefetch block in ms");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
- param_set_arc_int, param_get_int, ZMOD_RW,
+ param_set_arc_int, param_get_uint, ZMOD_RW,
"Min life of prescient prefetched block in ms");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
"Max write bytes per interval");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW,
"Extra write bytes during device warmup");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
"Number of max device writes to precache");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW,
"Compressed l2arc_headroom multiplier");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW,
"TRIM ahead L2ARC write size multiplier");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW,
"Seconds between L2ARC writing");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW,
"Min feed interval in milliseconds");
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
@@ -11093,41 +10710,42 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
"No reads during writes");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW,
"Percent of ARC size allowed for L2ARC-only headers");
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
"Rebuild the L2ARC when importing a pool");
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW,
"Min size in bytes to write rebuild log blocks in L2ARC");
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
"Cache only MFU data from ARC into L2ARC");
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW,
- "If set to 1 exclude dbufs on special vdevs from being cached to "
- "L2ARC.");
+ "Exclude dbufs on special vdevs from being cached to L2ARC if set.");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
- param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes");
+ param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes");
-ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long,
- param_get_long, ZMOD_RW, "System free memory target size in bytes");
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64,
+ spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes");
-ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long,
- param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc");
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64,
+ spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC");
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
- param_set_arc_long, param_get_long, ZMOD_RW,
+ param_set_arc_int, param_get_uint, ZMOD_RW,
"Percent of ARC meta buffers for dnodes");
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW,
"Percentage of excess dnodes to try to unpin");
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
"When full, ARC allocation waits for eviction of this % of alloc size");
-ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
"The number of headers to evict per sublist before moving to the next");
-/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
+ "Number of arc_prune threads");
diff --git a/sys/contrib/openzfs/module/zfs/blake3_zfs.c b/sys/contrib/openzfs/module/zfs/blake3_zfs.c
new file mode 100644
index 000000000000..7783282b671a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/blake3_zfs.c
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+#include <sys/blake3.h>
+#include <sys/abd.h>
+
+static int
+blake3_incremental(void *buf, size_t size, void *arg)
+{
+ BLAKE3_CTX *ctx = arg;
+
+ Blake3_Update(ctx, buf, size);
+
+ return (0);
+}
+
+/*
+ * Computes a native 256-bit BLAKE3 MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using abd_checksum_blake3_tmpl_init.
+ */
+void
+abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template,
+ zio_cksum_t *zcp)
+{
+ ASSERT(ctx_template != NULL);
+
+#if defined(_KERNEL)
+ kpreempt_disable();
+ BLAKE3_CTX *ctx = blake3_per_cpu_ctx[CPU_SEQID];
+#else
+ BLAKE3_CTX *ctx = kmem_alloc(sizeof (*ctx), KM_SLEEP);
+#endif
+
+ memcpy(ctx, ctx_template, sizeof (*ctx));
+ (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx);
+ Blake3_Final(ctx, (uint8_t *)zcp);
+
+#if defined(_KERNEL)
+ kpreempt_enable();
+#else
+ memset(ctx, 0, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+#endif
+}
+
+/*
+ * Byteswapped version of abd_checksum_blake3_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * BLAKE3 is internally endian-insensitive).
+ */
+void
+abd_checksum_blake3_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ ASSERT(ctx_template != NULL);
+
+ abd_checksum_blake3_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates a BLAKE3 MAC template suitable for using in BLAKE3 MAC checksum
+ * computations and returns a pointer to it.
+ */
+void *
+abd_checksum_blake3_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ BLAKE3_CTX *ctx;
+
+ ASSERT(sizeof (salt->zcs_bytes) == 32);
+
+ /* init reference object */
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ Blake3_InitKeyed(ctx, salt->zcs_bytes);
+
+ return (ctx);
+}
+
+/*
+ * Frees a BLAKE3 context template previously allocated using
+ * abd_checksum_blake3_tmpl_init.
+ */
+void
+abd_checksum_blake3_tmpl_free(void *ctx_template)
+{
+ BLAKE3_CTX *ctx = ctx_template;
+
+ memset(ctx, 0, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
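
A hedged usage sketch of the template API defined above, not part of the patch: the abd and salt are assumed to be supplied by the caller, and inside ZFS this wiring is normally done through the zio_checksum table rather than open-coded.

#include <sys/zfs_context.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>

/* Illustrative helper: checksum `size` bytes of `abd` with a keyed template. */
static void
blake3_checksum_example(abd_t *abd, uint64_t size,
    const zio_cksum_salt_t *salt, zio_cksum_t *zcp)
{
	void *tmpl = abd_checksum_blake3_tmpl_init(salt);

	abd_checksum_blake3_native(abd, size, tmpl, zcp);
	abd_checksum_blake3_tmpl_free(tmpl);
}
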
diff --git a/sys/contrib/openzfs/module/zfs/blkptr.c b/sys/contrib/openzfs/module/zfs/blkptr.c
index aa09ded8dba3..d85f0737f6f6 100644
--- a/sys/contrib/openzfs/module/zfs/blkptr.c
+++ b/sys/contrib/openzfs/module/zfs/blkptr.c
@@ -58,7 +58,7 @@ encode_embedded_bp_compressed(blkptr_t *bp, void *data,
ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
- bzero(bp, sizeof (*bp));
+ memset(bp, 0, sizeof (*bp));
BP_SET_EMBEDDED(bp, B_TRUE);
BP_SET_COMPRESS(bp, comp);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c
index 47ea364ef26f..da7360f8ce10 100644
--- a/sys/contrib/openzfs/module/zfs/bplist.c
+++ b/sys/contrib/openzfs/module/zfs/bplist.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -65,9 +65,8 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
- while ((bpe = list_head(&bpl->bpl_list))) {
+ while ((bpe = list_remove_head(&bpl->bpl_list))) {
bplist_iterate_last_removed = bpe;
- list_remove(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
func(arg, &bpe->bpe_blk, tx);
kmem_free(bpe, sizeof (*bpe));
@@ -82,10 +81,7 @@ bplist_clear(bplist_t *bpl)
bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
- while ((bpe = list_head(&bpl->bpl_list))) {
- bplist_iterate_last_removed = bpe;
- list_remove(&bpl->bpl_list, bpe);
+ while ((bpe = list_remove_head(&bpl->bpl_list)))
kmem_free(bpe, sizeof (*bpe));
- }
mutex_exit(&bpl->bpl_lock);
}
diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c
index e75ba5cccde6..96e1601c4e9c 100644
--- a/sys/contrib/openzfs/module/zfs/bpobj.c
+++ b/sys/contrib/openzfs/module/zfs/bpobj.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -156,7 +156,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
if (err)
return (err);
- bzero(bpo, sizeof (*bpo));
+ memset(bpo, 0, sizeof (*bpo));
mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
ASSERT(bpo->bpo_dbuf == NULL);
@@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
dmu_buf_t *dbuf = NULL;
bpobj_t *bpo = bpi->bpi_bpo;
- for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
+ int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
+ uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
+ sizeof (blkptr_t);
+ uint64_t ps = start * sizeof (blkptr_t);
+ uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
+ ps);
+ if (pe > pb) {
+ dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+ for (; i >= start; i--) {
uint64_t offset = i * sizeof (blkptr_t);
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
@@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
- offset, FTAG, &dbuf, 0);
+ offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
if (err)
break;
+ pe = pb;
+ pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
+ dbuf->db_offset - dmu_prefetch_max : 0, ps);
+ if (pe > pb) {
+ dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
+ pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
+ }
}
ASSERT3U(offset, >=, dbuf->db_offset);
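The hunk above keeps a prefetch window trailing the backward iteration, clamped
to dmu_prefetch_max bytes and never reaching below the first block pointer that
will be visited. A condensed sketch of that window arithmetic (generic names,
not from the patch):

static void
prefetch_behind(objset_t *os, uint64_t obj, uint64_t pos, uint64_t low,
    uint64_t max)
{
	/* Prefetch [pb, pos): at most 'max' bytes behind 'pos', never below 'low'. */
	uint64_t pb = MAX((pos > max) ? pos - max : 0, low);

	if (pos > pb) {
		dmu_prefetch(os, obj, 0, pb, pos - pb,
		    ZIO_PRIORITY_ASYNC_READ);
	}
}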
@@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
uint64_t offset = i * sizeof (uint64_t);
- uint64_t obj_from_sublist;
+ uint64_t subobj;
err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
- offset, sizeof (uint64_t), &obj_from_sublist,
- DMU_READ_PREFETCH);
+ offset, sizeof (uint64_t), &subobj,
+ DMU_READ_NO_PREFETCH);
if (err)
break;
- bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
- KM_SLEEP);
- err = bpobj_open(sublist, bpo->bpo_os,
- obj_from_sublist);
- if (err)
+ bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
+ KM_SLEEP);
+ err = bpobj_open(subbpo, bpo->bpo_os, subobj);
+ if (err) {
+ kmem_free(subbpo, sizeof (bpobj_t));
break;
+ }
- list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
- mutex_enter(&sublist->bpo_lock);
+ if (subbpo->bpo_havesubobj &&
+ subbpo->bpo_phys->bpo_subobjs != 0) {
+ dmu_prefetch(subbpo->bpo_os,
+ subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
+ list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
+ mutex_enter(&subbpo->bpo_lock);
bpi->bpi_unprocessed_subobjs--;
}
}
@@ -663,14 +688,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
}
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
- VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
-
if (bpobj_is_empty(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
return;
}
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
mutex_enter(&bpo->bpo_lock);
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
@@ -780,6 +804,68 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
}
+/*
+ * Prefetch metadata required for bpobj_enqueue_subobj().
+ */
+void
+bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
+{
+ dmu_object_info_t doi;
+ bpobj_t subbpo;
+ uint64_t subsubobjs;
+ boolean_t copy_subsub = B_TRUE;
+ boolean_t copy_bps = B_TRUE;
+
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(subobj != 0);
+
+ if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
+ return;
+
+ if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
+ return;
+ if (bpobj_is_empty(&subbpo)) {
+ bpobj_close(&subbpo);
+ return;
+ }
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ bpobj_close(&subbpo);
+
+ if (subsubobjs != 0) {
+ if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
+ return;
+ if (doi.doi_max_offset > doi.doi_data_block_size)
+ copy_subsub = B_FALSE;
+ }
+
+ if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
+ return;
+ if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
+ copy_bps = B_FALSE;
+
+ if (copy_subsub && subsubobjs != 0) {
+ if (bpo->bpo_phys->bpo_subobjs) {
+ dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+ dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
+ if (copy_bps) {
+ dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
+ bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
+ ZIO_PRIORITY_ASYNC_READ);
+ dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
+ ZIO_PRIORITY_ASYNC_READ);
+ } else if (bpo->bpo_phys->bpo_subobjs) {
+ dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+}
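bpobj_prefetch_subobj() is meant to run ahead of bpobj_enqueue_subobj() so that
the subobj lists and block pointer blocks the merge will copy are already
cached. A sketch of the expected two-pass caller pattern (subobjs[]/nsubobjs
are hypothetical, not from the patch):

	/* Pass 1: kick off asynchronous reads for every subobj to be merged. */
	for (uint64_t i = 0; i < nsubobjs; i++)
		bpobj_prefetch_subobj(bpo, subobjs[i]);

	/* Pass 2: perform the merges; most metadata reads should now hit. */
	for (uint64_t i = 0; i < nsubobjs; i++)
		bpobj_enqueue_subobj(bpo, subobjs[i], tx);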
+
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
@@ -805,12 +891,12 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
* set of BP's stored, and bpobj_iterate() wouldn't visit
* all the space accounted for in the bpobj.
*/
- bzero(&stored_bp, sizeof (stored_bp));
+ memset(&stored_bp, 0, sizeof (stored_bp));
stored_bp.blk_prop = bp->blk_prop;
- stored_bp.blk_birth = bp->blk_birth;
+ BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
} else if (!BP_GET_DEDUP(bp)) {
/* The bpobj will compress better without the checksum */
- bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+ memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
}
stored_bp.blk_fill = 0;
@@ -829,6 +915,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
offset, bpo, &bpo->bpo_cached_dbuf, 0));
+ ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
}
dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
@@ -860,13 +947,14 @@ struct space_range_arg {
uint64_t uncomp;
};
-/* ARGSUSED */
static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
+ (void) bp_freed, (void) tx;
struct space_range_arg *sra = arg;
- if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+ if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
+ BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
sra->used += bp_get_dsize_sync(sra->spa, bp);
else
@@ -898,7 +986,7 @@ bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
/*
* Return the amount of space in the bpobj which is:
- * mintxg < blk_birth <= maxtxg
+ * mintxg < logical birth <= maxtxg
*/
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
@@ -932,11 +1020,11 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
* bpobj are designated as free or allocated that information is not preserved
* in bplists.
*/
-/* ARGSUSED */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
+ (void) bp_freed, (void) tx;
bplist_t *bpl = arg;
bplist_append(bpl, bp);
return (0);
diff --git a/sys/contrib/openzfs/module/zfs/bptree.c b/sys/contrib/openzfs/module/zfs/bptree.c
index 1827a3c4e326..1f5d8e77bcc0 100644
--- a/sys/contrib/openzfs/module/zfs/bptree.c
+++ b/sys/contrib/openzfs/module/zfs/bptree.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -147,11 +147,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
dmu_buf_rele(db, FTAG);
}
-/* ARGSUSED */
static int
bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
+ (void) zilog, (void) dnp;
int err;
struct bptree_args *ba = arg;
diff --git a/sys/contrib/openzfs/module/zfs/bqueue.c b/sys/contrib/openzfs/module/zfs/bqueue.c
index 22539efc4e23..a7fa516975de 100644
--- a/sys/contrib/openzfs/module/zfs/bqueue.c
+++ b/sys/contrib/openzfs/module/zfs/bqueue.c
@@ -27,34 +27,46 @@ obj2node(bqueue_t *q, void *data)
/*
 * Initialize a blocking queue. The maximum capacity of the queue is set to
- * size. Types that are stored in a bqueue must contain a bqueue_node_t,
- * and node_offset must be its offset from the start of the struct.
- * fill_fraction is a performance tuning value; when the queue is full, any
- * threads attempting to enqueue records will block. They will block until
- * they're signaled, which will occur when the queue is at least 1/fill_fraction
+ * size. Types that are stored in a bqueue must contain a bqueue_node_t, and
+ * node_offset must be its offset from the start of the struct. fill_fraction
+ * is a performance tuning value; when the queue is full, any threads
+ * attempting to enqueue records will block. They will block until they're
+ * signaled, which will occur when the queue is at least 1/fill_fraction
* empty. Similar behavior occurs on dequeue; if the queue is empty, threads
- * block. They will be signalled when the queue has 1/fill_fraction full, or
- * when bqueue_flush is called. As a result, you must call bqueue_flush when
- * you enqueue your final record on a thread, in case the dequeueing threads are
- * currently blocked and that enqueue does not cause them to be awoken.
- * Alternatively, this behavior can be disabled (causing signaling to happen
- * immediately) by setting fill_fraction to any value larger than size.
- * Return 0 on success, or -1 on failure.
+ * block. They will be signaled when the queue is 1/fill_fraction full.
+ * As a result, you must call bqueue_enqueue_flush() when you enqueue your
+ * final record on a thread, in case the dequeuing threads are currently
+ * blocked and that enqueue does not cause them to be woken. Alternatively,
+ * this behavior can be disabled (causing signaling to happen immediately) by
+ * setting fill_fraction to any value larger than size. Return 0 on success,
+ * or -1 on failure.
+ *
+ * Note: The caller must ensure that for a given bqueue_t, there's only a
+ * single call to bqueue_enqueue() running at a time (e.g. by calling only
+ * from a single thread, or with locking around the call). Similarly, the
+ * caller must ensure that there's only a single call to bqueue_dequeue()
+ * running at a time. However, the one call to bqueue_enqueue() may be
+ * invoked concurrently with the one call to bqueue_dequeue().
*/
int
-bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size,
- size_t node_offset)
+bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset)
{
if (fill_fraction == 0) {
return (-1);
}
list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
node_offset + offsetof(bqueue_node_t, bqn_node));
+ list_create(&q->bq_dequeuing_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ list_create(&q->bq_enqueuing_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
q->bq_node_offset = node_offset;
q->bq_size = 0;
+ q->bq_dequeuing_size = 0;
+ q->bq_enqueuing_size = 0;
q->bq_maxsize = size;
q->bq_fill_fraction = fill_fraction;
return (0);
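A minimal single-producer/single-consumer sketch of the interface above
(illustrative only; item_t and its end-of-stream convention are hypothetical,
and the producer and consumer halves would normally run on separate threads):

typedef struct {
	bqueue_node_t in_node;	/* embedded node required by bqueue */
	boolean_t in_eos;	/* end-of-stream marker */
} item_t;

bqueue_t q;
VERIFY0(bqueue_init(&q, 4, 16 * 1024 * 1024, offsetof(item_t, in_node)));

item_t *item = kmem_zalloc(sizeof (item_t), KM_SLEEP);
item_t *eos = kmem_zalloc(sizeof (item_t), KM_SLEEP);
eos->in_eos = B_TRUE;

/* Producer: ordinary enqueues may batch; the final record must flush. */
bqueue_enqueue(&q, item, sizeof (item_t));
bqueue_enqueue_flush(&q, eos, sizeof (item_t));

/* Consumer: drain until the end-of-stream record arrives. */
item_t *it;
while (!(it = bqueue_dequeue(&q))->in_eos)
	kmem_free(it, sizeof (item_t));
kmem_free(it, sizeof (item_t));

bqueue_destroy(&q);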
@@ -70,31 +82,40 @@ bqueue_destroy(bqueue_t *q)
{
mutex_enter(&q->bq_lock);
ASSERT0(q->bq_size);
+ ASSERT0(q->bq_dequeuing_size);
+ ASSERT0(q->bq_enqueuing_size);
cv_destroy(&q->bq_add_cv);
cv_destroy(&q->bq_pop_cv);
list_destroy(&q->bq_list);
+ list_destroy(&q->bq_dequeuing_list);
+ list_destroy(&q->bq_enqueuing_list);
mutex_exit(&q->bq_lock);
mutex_destroy(&q->bq_lock);
}
static void
-bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size,
- boolean_t flush)
+bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush)
{
ASSERT3U(item_size, >, 0);
ASSERT3U(item_size, <=, q->bq_maxsize);
- mutex_enter(&q->bq_lock);
+
obj2node(q, data)->bqn_size = item_size;
- while (q->bq_size + item_size > q->bq_maxsize) {
- cv_wait_sig(&q->bq_add_cv, &q->bq_lock);
- }
- q->bq_size += item_size;
- list_insert_tail(&q->bq_list, data);
- if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction)
- cv_signal(&q->bq_pop_cv);
- if (flush)
+ q->bq_enqueuing_size += item_size;
+ list_insert_tail(&q->bq_enqueuing_list, data);
+
+ if (flush ||
+ q->bq_enqueuing_size >= q->bq_maxsize / q->bq_fill_fraction) {
+ /* Append the enqueuing list to the shared list. */
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size > q->bq_maxsize) {
+ cv_wait_sig(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += q->bq_enqueuing_size;
+ list_move_tail(&q->bq_list, &q->bq_enqueuing_list);
+ q->bq_enqueuing_size = 0;
cv_broadcast(&q->bq_pop_cv);
- mutex_exit(&q->bq_lock);
+ mutex_exit(&q->bq_lock);
+ }
}
/*
@@ -103,7 +124,7 @@ bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size,
* > 0.
*/
void
-bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+bqueue_enqueue(bqueue_t *q, void *data, size_t item_size)
{
bqueue_enqueue_impl(q, data, item_size, B_FALSE);
}
@@ -112,12 +133,12 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
* Enqueue an entry, and then flush the queue. This forces the popping threads
* to wake up, even if we're below the fill fraction. We have this in a single
* function, rather than having a separate call, because it prevents race
- * conditions between the enqueuing thread and the dequeueing thread, where the
- * enqueueing thread will wake up the dequeueing thread, that thread will
+ * conditions between the enqueuing thread and the dequeuing thread, where the
+ * enqueuing thread will wake up the dequeuing thread, and that thread will
* destroy the condvar before the enqueuing thread is done.
*/
void
-bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size)
+bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size)
{
bqueue_enqueue_impl(q, data, item_size, B_TRUE);
}
@@ -129,27 +150,26 @@ bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size)
void *
bqueue_dequeue(bqueue_t *q)
{
- void *ret = NULL;
- uint64_t item_size;
- mutex_enter(&q->bq_lock);
- while (q->bq_size == 0) {
- cv_wait_sig(&q->bq_pop_cv, &q->bq_lock);
+ void *ret = list_remove_head(&q->bq_dequeuing_list);
+ if (ret == NULL) {
+ /*
+ * Dequeuing list is empty. Wait for there to be something on
+ * the shared list, then move the entire shared list to the
+ * dequeuing list.
+ */
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait_sig(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ASSERT0(q->bq_dequeuing_size);
+ ASSERT(list_is_empty(&q->bq_dequeuing_list));
+ list_move_tail(&q->bq_dequeuing_list, &q->bq_list);
+ q->bq_dequeuing_size = q->bq_size;
+ q->bq_size = 0;
+ cv_broadcast(&q->bq_add_cv);
+ mutex_exit(&q->bq_lock);
+ ret = list_remove_head(&q->bq_dequeuing_list);
}
- ret = list_remove_head(&q->bq_list);
- ASSERT3P(ret, !=, NULL);
- item_size = obj2node(q, ret)->bqn_size;
- q->bq_size -= item_size;
- if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction))
- cv_signal(&q->bq_add_cv);
- mutex_exit(&q->bq_lock);
+ q->bq_dequeuing_size -= obj2node(q, ret)->bqn_size;
return (ret);
}
-
-/*
- * Returns true if the space used is 0.
- */
-boolean_t
-bqueue_empty(bqueue_t *q)
-{
- return (q->bq_size == 0);
-}
diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c
new file mode 100644
index 000000000000..ea8c0735c4b7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/brt.c
@@ -0,0 +1,1673 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/brt.h>
+#include <sys/brt_impl.h>
+#include <sys/ddt.h>
+#include <sys/bitmap.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/kstat.h>
+#include <sys/wmsum.h>
+
+/*
+ * Block Cloning design.
+ *
+ * Block Cloning allows a file (or a subset of its blocks) to be manually
+ * cloned into another (or the same) file by creating additional references to
+ * the data blocks without copying the data itself. Those references are kept
+ * in the Block Reference Tables (BRTs).
+ *
+ * In many ways this is similar to the existing deduplication, but there are
+ * some important differences:
+ *
+ * - Deduplication is automatic and Block Cloning is not - one has to use a
+ * dedicated system call(s) to clone the given file/blocks.
+ * - Deduplication keeps all data blocks in its table, even those referenced
+ * just once. Block Cloning creates an entry in its tables only when there
+ * are at least two references to the given data block. If the block was
+ * never explicitly cloned or the second to last reference was dropped,
+ * there will be neither space nor performance overhead.
+ * - Deduplication needs data to work - one needs to pass real data to the
+ * write(2) syscall, so a hash can be calculated. Block Cloning doesn't require
+ * data, just block pointers to the data, so it is extremely fast, as we pay
+ * neither the cost of reading the data, nor the cost of writing the data -
+ * we operate exclusively on metadata.
+ * - If the D (dedup) bit is not set in the block pointer, it means that
+ * the block is not in the dedup table (DDT) and we won't consult the DDT
+ * when we need to free the block. Block Cloning must be consulted on every
+ * free, because we cannot modify the source BP (e.g. by setting something
+ * similar to the D bit), thus we have no hint if the block is in the
+ * Block Reference Table (BRT), so we need to look into the BRT. There is
+ * an optimization in place that allows us to eliminate the majority of BRT
+ * lookups, which is described below in the "Minimizing free penalty" section.
+ * - The BRT entry is much smaller than the DDT entry - for BRT we only store
+ * 64bit offset and 64bit reference counter.
+ * - Dedup keys are cryptographic hashes, so two blocks that are close to each
+ * other on disk are most likely in totally different parts of the DDT.
+ * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
+ * from one file should have BRT entries close to each other.
+ * - Scrub will only do a single pass over a block that is referenced multiple
+ * times in the DDT. Unfortunately it is not currently (if at all) possible
+ * with Block Cloning, and a block referenced multiple times will be scrubbed
+ * multiple times. The new, sorted scrub should be able to eliminate
+ * duplicated reads given enough memory.
+ * - Deduplication requires a cryptographically strong hash as a checksum or
+ * additional data verification. Block Cloning works with any checksum
+ * algorithm or even with checksumming disabled.
+ *
+ * As mentioned above, the BRT entries are much smaller than the DDT entries.
+ * To uniquely identify a block we just need its vdev id and offset. We also
+ * need to maintain a reference counter. The vdev id will often repeat, as there
+ * is a small number of top-level VDEVs and a large number of blocks stored in
+ * each VDEV. We take advantage of that to reduce the BRT entry size further by
+ * maintaining one BRT for each top-level VDEV, so we can then have only offset
+ * and counter as the BRT entry.
+ *
+ * Minimizing free penalty.
+ *
+ * Block Cloning allows creating additional references to any existing block.
+ * When we free a block there is no hint in the block pointer whether the block
+ * was cloned or not, so on each free we have to check if there is a
+ * corresponding entry in the BRT or not. If there is, we need to decrease
+ * the reference counter. Doing a BRT lookup on every free can potentially be
+ * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
+ * This is the main problem with deduplication, so we've learned our lesson and
+ * try not to repeat the same mistake here. How do we do that? We divide each
+ * top-level VDEV into 16MB regions. For each region we maintain a counter that
+ * is a sum of all the BRT entries that have offsets within the region. This
+ * creates the entries count array of 16bit numbers for each top-level VDEV.
+ * The entries count array is always kept in memory and updated on disk in the
+ * same transaction group as the BRT updates to keep everything in-sync. We can
+ * keep the array in memory, because it is very small. With 16MB regions and
+ * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
+ * the region size even further in the future). Now, when we want to free
+ * a block, we first consult the array. If the counter for the whole region is
+ * zero, there is no need to look for the BRT entry, as there isn't one for
+ * sure. If the counter for the region is greater than zero, only then will we
+ * do a BRT lookup, and if an entry is found we will decrease the reference
+ * counter in the BRT entry and in the entry counters array.
+ *
+ * The entry counters array is small, but can potentially be larger for very
+ * large VDEVs or smaller regions. In this case we don't want to rewrite the
+ * entire array on every change. We divide the array into 32kB blocks and keep
+ * a bitmap of dirty blocks within a transaction group. When we sync the
+ * transaction group we can only update the parts of the entry counters array
+ * that were modified. Note: Keeping track of the dirty parts of the entry
+ * counters array is implemented, but updating only parts of the array on disk
+ * is not yet implemented - for now we update the entire array if there was
+ * any change.
+ *
+ * The implementation tries to be economic: if BRT is not used, or no longer
+ * used, there will be no entries in the MOS and no additional memory used (e.g.
+ * the entry counters array is only allocated if needed).
+ *
+ * Interaction between Deduplication and Block Cloning.
+ *
+ * If both functionalities are in use, we could end up with a block that is
+ * referenced multiple times in both DDT and BRT. When we free one of the
+ * references we couldn't tell where it belongs, so we would have to decide
+ * which table takes precedence: do we first clear DDT references or BRT
+ * references? To avoid this dilemma BRT cooperates with DDT - if a given block
+ * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
+ * look up the DDT entry instead and increase the counter there. No BRT entry
+ * will be created for a block which has the D (dedup) bit set.
+ * BRT may be more efficient for manual deduplication, but if the block is
+ * already in the DDT, then creating an additional BRT entry would be less
+ * efficient. This clever idea was proposed by Allan Jude.
+ *
+ * Block Cloning across datasets.
+ *
+ * Block Cloning is not limited to cloning blocks within the same dataset.
+ * It is possible (and very useful) to clone blocks between different datasets.
+ * One use case is recovering files from snapshots. By cloning the files into a
+ * dataset we need no additional storage. Without Block Cloning we would need
+ * additional space for those files.
+ * Another interesting use case is moving the files between datasets
+ * (copying the file content to the new dataset and removing the source file).
+ * In that case Block Cloning will only be used briefly, because the BRT entries
+ * will be removed when the source is removed.
+ * Block Cloning across encrypted datasets is supported as long as both
+ * datasets share the same master key (e.g. snapshots and clones).
+ *
+ * Block Cloning flow through ZFS layers.
+ *
+ * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
+ * blocks. As of this writing no interface is implemented that allows for block
+ * cloning within a ZVOL.
+ * FreeBSD and Linux provide the copy_file_range(2) system call and we use it
+ * for block cloning.
+ *
+ * ssize_t
+ * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
+ * size_t len, unsigned int flags);
+ *
+ * Even though offsets and length represent bytes, they have to be
+ * block-aligned or we will return an error so the upper layer can
+ * fall back to the generic mechanism that will just copy the data.
+ * Using copy_file_range(2) calls the OS-independent zfs_clone_range() function.
+ * This function was implemented based on zfs_write(), but instead of writing
+ * the given data we first read block pointers using the new dmu_read_l0_bps()
+ * function from the source file. Once we have BPs from the source file we call
+ * the dmu_brt_clone() function on the destination file. This function
+ * allocates BPs for us. We iterate over all source BPs. If the given BP is
+ * a hole or an embedded block, we just copy the BP as-is. If it points to real
+ * data, we place this BP on a BRT pending list using the brt_pending_add()
+ * function.
+ *
+ * We use this pending list to keep track of all BPs that got new references
+ * within this transaction group.
+ *
+ * Some special cases to consider and how we address them:
+ * - The block we want to clone may have been created within the same
+ * transaction group that we are trying to clone. Such a block has no BP
+ * allocated yet, so it cannot be immediately cloned. We return EAGAIN.
+ * - The block we want to clone may have been modified within the same
+ * transaction group. We return EAGAIN.
+ * - A block may be cloned multiple times during one transaction group (that's
+ * why the pending list is actually a tree and not an append-only list - this
+ * way we can figure out faster if this block is cloned for the first time
+ * in this txg or a subsequent time).
+ * - A block may be cloned and freed within the same transaction group
+ * (see dbuf_undirty()).
+ * - A block may be cloned and within the same transaction group the clone
+ * can be cloned again (see dmu_read_l0_bps()).
+ * - A file might have been deleted, but the caller still has a file descriptor
+ * open to this file and clones it.
+ *
+ * When we free a block we have an additional step in the ZIO pipeline where we
+ * call the zio_brt_free() function. We then call the brt_entry_decref()
+ * that loads the corresponding BRT entry (if one exists) and decreases its
+ * reference counter. If this is not the last reference we will stop the ZIO
+ * pipeline here. If this is the last reference or the block is not in the
+ * BRT, we continue the pipeline and free the block as usual.
+ *
+ * At the beginning of spa_sync() where there can be no more block cloning,
+ * but before issuing frees we call brt_pending_apply(). This function applies
+ * all the new clones to the BRT table - we load BRT entries and update
+ * reference counters. To sync new BRT entries to disk, we use brt_sync()
+ * function. This function will sync all dirty per-top-level-vdev BRTs,
+ * the entry counters arrays, etc.
+ *
+ * Block Cloning and ZIL.
+ *
+ * Every clone operation is divided into chunks (similar to write) and each
+ * chunk is cloned in a separate transaction. The chunk size is determined by
+ * how many BPs we can fit into a single ZIL entry.
+ * Replaying a clone operation is different from the regular clone operation,
+ * as when we log clone operations we cannot use the source object - it may
+ * reside on a different dataset, so we log the BPs we want to clone.
+ * The ZIL is replayed when we mount the given dataset, not when the pool is
+ * imported. Taking this into account it is possible that the pool is imported
+ * without mounting datasets and the source dataset is destroyed before the
+ * destination dataset is mounted and its ZIL replayed.
+ * To address this situation we leverage zil_claim() mechanism where ZFS will
+ * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
+ * entries, we will bump reference counters for their BPs in the BRT. Then
+ * on mount and ZIL replay we bump the reference counters once more, while the
+ * first references are dropped during ZIL destroy by zil_free_clone_range().
+ * It is possible that after zil_claim() we never mount the destination, so
+ * we never replay its ZIL and just destroy it. In this case the only taken
+ * references will be dropped by zil_free_clone_range(), since the cloning is
+ * not going to ever take place.
+ */
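To make the copy_file_range(2) path described above concrete, a small
user-space sketch (not part of this file) that clones one file into another;
when the requested ranges are not block-aligned, ZFS rejects the clone and the
upper layer transparently falls back to an ordinary copy:

#define _GNU_SOURCE	/* for copy_file_range() with glibc */
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}

	int in = open(argv[1], O_RDONLY);
	int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;

	if (in < 0 || out < 0 || fstat(in, &st) != 0) {
		perror("open/fstat");
		return (1);
	}

	/* NULL offsets: use and advance the regular file offsets. */
	size_t left = (size_t)st.st_size;
	while (left > 0) {
		ssize_t n = copy_file_range(in, NULL, out, NULL, left, 0);
		if (n <= 0) {
			perror("copy_file_range");
			return (1);
		}
		left -= (size_t)n;
	}

	close(in);
	close(out);
	return (0);
}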
+
+static kmem_cache_t *brt_entry_cache;
+static kmem_cache_t *brt_pending_entry_cache;
+
+/*
+ * Enable/disable prefetching of BRT entries that we are going to modify.
+ */
+static int brt_zap_prefetch = 1;
+
+#ifdef ZFS_DEBUG
+#define BRT_DEBUG(...) do { \
+ if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \
+ __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
+ } \
+} while (0)
+#else
+#define BRT_DEBUG(...) do { } while (0)
+#endif
+
+static int brt_zap_default_bs = 12;
+static int brt_zap_default_ibs = 12;
+
+static kstat_t *brt_ksp;
+
+typedef struct brt_stats {
+ kstat_named_t brt_addref_entry_in_memory;
+ kstat_named_t brt_addref_entry_not_on_disk;
+ kstat_named_t brt_addref_entry_on_disk;
+ kstat_named_t brt_addref_entry_read_lost_race;
+ kstat_named_t brt_decref_entry_in_memory;
+ kstat_named_t brt_decref_entry_loaded_from_disk;
+ kstat_named_t brt_decref_entry_not_in_memory;
+ kstat_named_t brt_decref_entry_not_on_disk;
+ kstat_named_t brt_decref_entry_read_lost_race;
+ kstat_named_t brt_decref_entry_still_referenced;
+ kstat_named_t brt_decref_free_data_later;
+ kstat_named_t brt_decref_free_data_now;
+ kstat_named_t brt_decref_no_entry;
+} brt_stats_t;
+
+static brt_stats_t brt_stats = {
+ { "addref_entry_in_memory", KSTAT_DATA_UINT64 },
+ { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 },
+ { "addref_entry_on_disk", KSTAT_DATA_UINT64 },
+ { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 },
+ { "decref_entry_in_memory", KSTAT_DATA_UINT64 },
+ { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 },
+ { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 },
+ { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 },
+ { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 },
+ { "decref_entry_still_referenced", KSTAT_DATA_UINT64 },
+ { "decref_free_data_later", KSTAT_DATA_UINT64 },
+ { "decref_free_data_now", KSTAT_DATA_UINT64 },
+ { "decref_no_entry", KSTAT_DATA_UINT64 }
+};
+
+struct {
+ wmsum_t brt_addref_entry_in_memory;
+ wmsum_t brt_addref_entry_not_on_disk;
+ wmsum_t brt_addref_entry_on_disk;
+ wmsum_t brt_addref_entry_read_lost_race;
+ wmsum_t brt_decref_entry_in_memory;
+ wmsum_t brt_decref_entry_loaded_from_disk;
+ wmsum_t brt_decref_entry_not_in_memory;
+ wmsum_t brt_decref_entry_not_on_disk;
+ wmsum_t brt_decref_entry_read_lost_race;
+ wmsum_t brt_decref_entry_still_referenced;
+ wmsum_t brt_decref_free_data_later;
+ wmsum_t brt_decref_free_data_now;
+ wmsum_t brt_decref_no_entry;
+} brt_sums;
+
+#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1)
+
+static int brt_entry_compare(const void *x1, const void *x2);
+static int brt_pending_entry_compare(const void *x1, const void *x2);
+
+static void
+brt_rlock(brt_t *brt)
+{
+ rw_enter(&brt->brt_lock, RW_READER);
+}
+
+static void
+brt_wlock(brt_t *brt)
+{
+ rw_enter(&brt->brt_lock, RW_WRITER);
+}
+
+static void
+brt_unlock(brt_t *brt)
+{
+ rw_exit(&brt->brt_lock);
+}
+
+static uint16_t
+brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
+{
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ if (unlikely(brtvd->bv_need_byteswap)) {
+ return (BSWAP_16(brtvd->bv_entcount[idx]));
+ } else {
+ return (brtvd->bv_entcount[idx]);
+ }
+}
+
+static void
+brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
+{
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ if (unlikely(brtvd->bv_need_byteswap)) {
+ brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
+ } else {
+ brtvd->bv_entcount[idx] = entcnt;
+ }
+}
+
+static void
+brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
+{
+ uint16_t entcnt;
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ entcnt = brt_vdev_entcount_get(brtvd, idx);
+ ASSERT(entcnt < UINT16_MAX);
+
+ brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
+}
+
+static void
+brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
+{
+ uint16_t entcnt;
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ entcnt = brt_vdev_entcount_get(brtvd, idx);
+ ASSERT(entcnt > 0);
+
+ brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
+}
+
+#ifdef ZFS_DEBUG
+static void
+brt_vdev_dump(brt_vdev_t *brtvd)
+{
+ uint64_t idx;
+
+ zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
+ "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
+ (u_longlong_t)brtvd->bv_vdevid,
+ brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
+ (u_longlong_t)brtvd->bv_size,
+ (u_longlong_t)brtvd->bv_totalcount,
+ (u_longlong_t)brtvd->bv_nblocks,
+ (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
+ if (brtvd->bv_totalcount > 0) {
+ zfs_dbgmsg(" entcounts:");
+ for (idx = 0; idx < brtvd->bv_size; idx++) {
+ uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx);
+ if (entcnt > 0) {
+ zfs_dbgmsg(" [%04llu] %hu",
+ (u_longlong_t)idx, entcnt);
+ }
+ }
+ }
+ if (brtvd->bv_entcount_dirty) {
+ char *bitmap;
+
+ bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
+ for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
+ bitmap[idx] =
+ BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
+ }
+ bitmap[idx] = '\0';
+ zfs_dbgmsg(" dirty: %s", bitmap);
+ kmem_free(bitmap, brtvd->bv_nblocks + 1);
+ }
+}
+#endif
+
+static brt_vdev_t *
+brt_vdev(brt_t *brt, uint64_t vdevid)
+{
+ brt_vdev_t *brtvd;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+ if (vdevid < brt->brt_nvdevs) {
+ brtvd = &brt->brt_vdevs[vdevid];
+ } else {
+ brtvd = NULL;
+ }
+
+ return (brtvd);
+}
+
+static void
+brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+ char name[64];
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT0(brtvd->bv_mos_brtvdev);
+ ASSERT0(brtvd->bv_mos_entries);
+ ASSERT(brtvd->bv_entcount != NULL);
+ ASSERT(brtvd->bv_size > 0);
+ ASSERT(brtvd->bv_bitmap != NULL);
+ ASSERT(brtvd->bv_nblocks > 0);
+
+ brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
+ ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
+ brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
+ VERIFY(brtvd->bv_mos_entries != 0);
+ BRT_DEBUG("MOS entries created, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_entries);
+
+ /*
+ * We allocate a DMU buffer to store the bv_entcount[] array.
+ * We will keep the array size (bv_size) and the cumulative count for all
+ * bv_entcount[]s (bv_totalcount) in the bonus buffer.
+ */
+ brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
+ DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
+ VERIFY(brtvd->bv_mos_brtvdev != 0);
+ BRT_DEBUG("MOS BRT VDEV created, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_brtvdev);
+
+ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+ (u_longlong_t)brtvd->bv_vdevid);
+ VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
+ BRT_DEBUG("Pool directory object created, object=%s", name);
+
+ spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+static void
+brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+ vdev_t *vd;
+ uint16_t *entcount;
+ ulong_t *bitmap;
+ uint64_t nblocks, size;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+
+ spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
+ size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
+ spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
+
+ entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
+ nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
+ bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
+
+ if (!brtvd->bv_initiated) {
+ ASSERT0(brtvd->bv_size);
+ ASSERT(brtvd->bv_entcount == NULL);
+ ASSERT(brtvd->bv_bitmap == NULL);
+ ASSERT0(brtvd->bv_nblocks);
+
+ avl_create(&brtvd->bv_tree, brt_entry_compare,
+ sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
+ } else {
+ ASSERT(brtvd->bv_size > 0);
+ ASSERT(brtvd->bv_entcount != NULL);
+ ASSERT(brtvd->bv_bitmap != NULL);
+ ASSERT(brtvd->bv_nblocks > 0);
+ /*
+ * TODO: Allow vdev shrinking. We only need to implement
+ * shrinking the on-disk BRT VDEV object.
+ * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
+ * size, tx);
+ */
+ ASSERT3U(brtvd->bv_size, <=, size);
+
+ memcpy(entcount, brtvd->bv_entcount,
+ sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
+ memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
+ BT_SIZEOFMAP(brtvd->bv_nblocks)));
+ vmem_free(brtvd->bv_entcount,
+ sizeof (entcount[0]) * brtvd->bv_size);
+ kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ }
+
+ brtvd->bv_size = size;
+ brtvd->bv_entcount = entcount;
+ brtvd->bv_bitmap = bitmap;
+ brtvd->bv_nblocks = nblocks;
+ if (!brtvd->bv_initiated) {
+ brtvd->bv_need_byteswap = FALSE;
+ brtvd->bv_initiated = TRUE;
+ BRT_DEBUG("BRT VDEV %llu initiated.",
+ (u_longlong_t)brtvd->bv_vdevid);
+ }
+}
+
+static void
+brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
+{
+ char name[64];
+ dmu_buf_t *db;
+ brt_vdev_phys_t *bvphys;
+ int error;
+
+ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+ (u_longlong_t)brtvd->bv_vdevid);
+ error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
+ if (error != 0)
+ return;
+ ASSERT(brtvd->bv_mos_brtvdev != 0);
+
+ error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
+ ASSERT0(error);
+ if (error != 0)
+ return;
+
+ bvphys = db->db_data;
+ if (brt->brt_rangesize == 0) {
+ brt->brt_rangesize = bvphys->bvp_rangesize;
+ } else {
+ ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
+ }
+
+ ASSERT(!brtvd->bv_initiated);
+ brt_vdev_realloc(brt, brtvd);
+
+ /* TODO: We don't support VDEV shrinking. */
+ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
+
+ /*
+ * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
+ */
+ error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+ MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
+ brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
+ ASSERT0(error);
+
+ brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
+ ASSERT(brtvd->bv_mos_entries != 0);
+ brtvd->bv_need_byteswap =
+ (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
+ brtvd->bv_totalcount = bvphys->bvp_totalcount;
+ brtvd->bv_usedspace = bvphys->bvp_usedspace;
+ brtvd->bv_savedspace = bvphys->bvp_savedspace;
+ brt->brt_usedspace += brtvd->bv_usedspace;
+ brt->brt_savedspace += brtvd->bv_savedspace;
+
+ dmu_buf_rele(db, FTAG);
+
+ BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
+ name, (u_longlong_t)brtvd->bv_mos_brtvdev,
+ (u_longlong_t)brtvd->bv_mos_entries);
+}
+
+static void
+brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_initiated);
+
+ vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
+ brtvd->bv_entcount = NULL;
+ kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ brtvd->bv_bitmap = NULL;
+ ASSERT0(avl_numnodes(&brtvd->bv_tree));
+ avl_destroy(&brtvd->bv_tree);
+
+ brtvd->bv_size = 0;
+ brtvd->bv_nblocks = 0;
+
+ brtvd->bv_initiated = FALSE;
+ BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
+}
+
+static void
+brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+ char name[64];
+ uint64_t count;
+ dmu_buf_t *db;
+ brt_vdev_phys_t *bvphys;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd->bv_mos_brtvdev != 0);
+ ASSERT(brtvd->bv_mos_entries != 0);
+
+ VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
+ VERIFY0(count);
+ VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
+ BRT_DEBUG("MOS entries destroyed, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_entries);
+ brtvd->bv_mos_entries = 0;
+
+ VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+ bvphys = db->db_data;
+ ASSERT0(bvphys->bvp_totalcount);
+ ASSERT0(bvphys->bvp_usedspace);
+ ASSERT0(bvphys->bvp_savedspace);
+ dmu_buf_rele(db, FTAG);
+
+ VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
+ BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
+ (u_longlong_t)brtvd->bv_mos_brtvdev);
+ brtvd->bv_mos_brtvdev = 0;
+
+ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+ (u_longlong_t)brtvd->bv_vdevid);
+ VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+ BRT_DEBUG("Pool directory object removed, object=%s", name);
+
+ brt_vdev_dealloc(brt, brtvd);
+
+ spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+static void
+brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
+{
+ brt_vdev_t *brtvd, *vdevs;
+ uint64_t vdevid;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT3U(nvdevs, >, brt->brt_nvdevs);
+
+ vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
+ if (brt->brt_nvdevs > 0) {
+ ASSERT(brt->brt_vdevs != NULL);
+
+ memcpy(vdevs, brt->brt_vdevs,
+ sizeof (brt_vdev_t) * brt->brt_nvdevs);
+ kmem_free(brt->brt_vdevs,
+ sizeof (brt_vdev_t) * brt->brt_nvdevs);
+ }
+ for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
+ brtvd = &vdevs[vdevid];
+
+ brtvd->bv_vdevid = vdevid;
+ brtvd->bv_initiated = FALSE;
+ }
+
+ BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
+ (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
+
+ brt->brt_vdevs = vdevs;
+ brt->brt_nvdevs = nvdevs;
+}
+
+static boolean_t
+brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
+{
+ uint64_t idx;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+ idx = bre->bre_offset / brt->brt_rangesize;
+ if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) {
+ /* VDEV wasn't expanded. */
+ return (brt_vdev_entcount_get(brtvd, idx) > 0);
+ }
+
+ return (FALSE);
+}
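A worked example of the lookup above (numbers illustrative): with the 16MB
regions described in the design comment, a freed block at offset 40 GiB on its
top-level vdev maps to bv_entcount[] index 40 GiB / 16 MiB = 2560, so a single
in-memory 16-bit counter decides whether a ZAP lookup for that offset is needed
at all.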
+
+static void
+brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+ uint64_t dsize)
+{
+ uint64_t idx;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+ ASSERT(brtvd != NULL);
+ ASSERT(brtvd->bv_entcount != NULL);
+
+ brt->brt_savedspace += dsize;
+ brtvd->bv_savedspace += dsize;
+ brtvd->bv_meta_dirty = TRUE;
+
+ if (bre->bre_refcount > 1) {
+ return;
+ }
+
+ brt->brt_usedspace += dsize;
+ brtvd->bv_usedspace += dsize;
+
+ idx = bre->bre_offset / brt->brt_rangesize;
+ if (idx >= brtvd->bv_size) {
+ /* VDEV has been expanded. */
+ brt_vdev_realloc(brt, brtvd);
+ }
+
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ brtvd->bv_totalcount++;
+ brt_vdev_entcount_inc(brtvd, idx);
+ brtvd->bv_entcount_dirty = TRUE;
+ idx = idx / BRT_BLOCKSIZE / 8;
+ BT_SET(brtvd->bv_bitmap, idx);
+
+#ifdef ZFS_DEBUG
+ if (zfs_flags & ZFS_DEBUG_BRT)
+ brt_vdev_dump(brtvd);
+#endif
+}
+
+static void
+brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+ uint64_t dsize)
+{
+ uint64_t idx;
+
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ ASSERT(brtvd != NULL);
+ ASSERT(brtvd->bv_entcount != NULL);
+
+ brt->brt_savedspace -= dsize;
+ brtvd->bv_savedspace -= dsize;
+ brtvd->bv_meta_dirty = TRUE;
+
+ if (bre->bre_refcount > 0) {
+ return;
+ }
+
+ brt->brt_usedspace -= dsize;
+ brtvd->bv_usedspace -= dsize;
+
+ idx = bre->bre_offset / brt->brt_rangesize;
+ ASSERT3U(idx, <, brtvd->bv_size);
+
+ ASSERT(brtvd->bv_totalcount > 0);
+ brtvd->bv_totalcount--;
+ brt_vdev_entcount_dec(brtvd, idx);
+ brtvd->bv_entcount_dirty = TRUE;
+ idx = idx / BRT_BLOCKSIZE / 8;
+ BT_SET(brtvd->bv_bitmap, idx);
+
+#ifdef ZFS_DEBUG
+ if (zfs_flags & ZFS_DEBUG_BRT)
+ brt_vdev_dump(brtvd);
+#endif
+}
+
+static void
+brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ brt_vdev_phys_t *bvphys;
+
+ ASSERT(brtvd->bv_meta_dirty);
+ ASSERT(brtvd->bv_mos_brtvdev != 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+
+ if (brtvd->bv_entcount_dirty) {
+ /*
+ * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
+ */
+ dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+ brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
+ brtvd->bv_entcount, tx);
+ memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
+ brtvd->bv_entcount_dirty = FALSE;
+ }
+
+ dmu_buf_will_dirty(db, tx);
+ bvphys = db->db_data;
+ bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
+ bvphys->bvp_size = brtvd->bv_size;
+ if (brtvd->bv_need_byteswap) {
+ bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
+ } else {
+ bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
+ }
+ bvphys->bvp_totalcount = brtvd->bv_totalcount;
+ bvphys->bvp_rangesize = brt->brt_rangesize;
+ bvphys->bvp_usedspace = brtvd->bv_usedspace;
+ bvphys->bvp_savedspace = brtvd->bv_savedspace;
+ dmu_buf_rele(db, FTAG);
+
+ brtvd->bv_meta_dirty = FALSE;
+}
+
+static void
+brt_vdevs_alloc(brt_t *brt, boolean_t load)
+{
+ brt_vdev_t *brtvd;
+ uint64_t vdevid;
+
+ brt_wlock(brt);
+
+ brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
+
+ if (load) {
+ for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+ brtvd = &brt->brt_vdevs[vdevid];
+ ASSERT(brtvd->bv_entcount == NULL);
+
+ brt_vdev_load(brt, brtvd);
+ }
+ }
+
+ if (brt->brt_rangesize == 0) {
+ brt->brt_rangesize = BRT_RANGESIZE;
+ }
+
+ brt_unlock(brt);
+}
+
+static void
+brt_vdevs_free(brt_t *brt)
+{
+ brt_vdev_t *brtvd;
+ uint64_t vdevid;
+
+ brt_wlock(brt);
+
+ for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+ brtvd = &brt->brt_vdevs[vdevid];
+ if (brtvd->bv_initiated)
+ brt_vdev_dealloc(brt, brtvd);
+ }
+ kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
+
+ brt_unlock(brt);
+}
+
+static void
+brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
+{
+
+ bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
+ bre->bre_refcount = 0;
+
+ *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
+}
+
+static int
+brt_entry_compare(const void *x1, const void *x2)
+{
+ const brt_entry_t *bre1 = x1;
+ const brt_entry_t *bre2 = x2;
+
+ return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
+}
+
+static int
+brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
+{
+ uint64_t mos_entries;
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+ if (!brt_vdev_lookup(brt, brtvd, bre))
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * Remember mos_entries object number. After we reacquire the BRT lock,
+ * the brtvd pointer may be invalid.
+ */
+ mos_entries = brtvd->bv_mos_entries;
+ if (mos_entries == 0)
+ return (SET_ERROR(ENOENT));
+
+ brt_unlock(brt);
+
+ error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
+ BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount);
+
+ brt_wlock(brt);
+
+ return (error);
+}
+
+static void
+brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
+{
+ brt_vdev_t *brtvd;
+ uint64_t mos_entries = 0;
+
+ brt_rlock(brt);
+ brtvd = brt_vdev(brt, vdevid);
+ if (brtvd != NULL)
+ mos_entries = brtvd->bv_mos_entries;
+ brt_unlock(brt);
+
+ if (mos_entries == 0)
+ return;
+
+ (void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
+ (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
+}
+
+/*
+ * Return TRUE if we _can_ have a BRT entry for this bp. It might be a false
+ * positive, but it gives us a quick answer as to whether we should look into
+ * the BRT, which may require reads and thus be more expensive.
+ */
+boolean_t
+brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
+{
+ brt_t *brt = spa->spa_brt;
+ brt_vdev_t *brtvd;
+ brt_entry_t bre_search;
+ boolean_t mayexists = FALSE;
+ uint64_t vdevid;
+
+ brt_entry_fill(bp, &bre_search, &vdevid);
+
+ brt_rlock(brt);
+
+ brtvd = brt_vdev(brt, vdevid);
+ if (brtvd != NULL && brtvd->bv_initiated) {
+ if (!avl_is_empty(&brtvd->bv_tree) ||
+ brt_vdev_lookup(brt, brtvd, &bre_search)) {
+ mayexists = TRUE;
+ }
+ }
+
+ brt_unlock(brt);
+
+ return (mayexists);
+}
+
+uint64_t
+brt_get_dspace(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return (0);
+
+ return (brt->brt_savedspace);
+}
+
+uint64_t
+brt_get_used(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return (0);
+
+ return (brt->brt_usedspace);
+}
+
+uint64_t
+brt_get_saved(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return (0);
+
+ return (brt->brt_savedspace);
+}
+
+uint64_t
+brt_get_ratio(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt->brt_usedspace == 0)
+ return (100);
+
+ return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
+ brt->brt_usedspace);
+}
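For example (values illustrative): with brt_usedspace = 1 GiB of cloned data
kept on disk and brt_savedspace = 3 GiB saved by the extra references, the
function returns (1 + 3) * 100 / 1 = 400, i.e. a 4.00x clone ratio; with no BRT
usage at all it reports 100 (1.00x).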
+
+static int
+brt_kstats_update(kstat_t *ksp, int rw)
+{
+ brt_stats_t *bs = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ bs->brt_addref_entry_in_memory.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_in_memory);
+ bs->brt_addref_entry_not_on_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
+ bs->brt_addref_entry_on_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_on_disk);
+ bs->brt_addref_entry_read_lost_race.value.ui64 =
+ wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
+ bs->brt_decref_entry_in_memory.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_in_memory);
+ bs->brt_decref_entry_loaded_from_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
+ bs->brt_decref_entry_not_in_memory.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
+ bs->brt_decref_entry_not_on_disk.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
+ bs->brt_decref_entry_read_lost_race.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
+ bs->brt_decref_entry_still_referenced.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
+ bs->brt_decref_free_data_later.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_free_data_later);
+ bs->brt_decref_free_data_now.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_free_data_now);
+ bs->brt_decref_no_entry.value.ui64 =
+ wmsum_value(&brt_sums.brt_decref_no_entry);
+
+ return (0);
+}
+
+static void
+brt_stat_init(void)
+{
+
+ wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
+ wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
+ wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
+ wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
+ wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
+ wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
+ wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
+ wmsum_init(&brt_sums.brt_decref_no_entry, 0);
+
+ brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (brt_ksp != NULL) {
+ brt_ksp->ks_data = &brt_stats;
+ brt_ksp->ks_update = brt_kstats_update;
+ kstat_install(brt_ksp);
+ }
+}
+
+static void
+brt_stat_fini(void)
+{
+ if (brt_ksp != NULL) {
+ kstat_delete(brt_ksp);
+ brt_ksp = NULL;
+ }
+
+ wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
+ wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
+ wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
+ wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
+ wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
+ wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
+ wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
+ wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
+ wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
+ wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
+ wmsum_fini(&brt_sums.brt_decref_free_data_later);
+ wmsum_fini(&brt_sums.brt_decref_free_data_now);
+ wmsum_fini(&brt_sums.brt_decref_no_entry);
+}
+
+void
+brt_init(void)
+{
+ brt_entry_cache = kmem_cache_create("brt_entry_cache",
+ sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
+ sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ brt_stat_init();
+}
+
+void
+brt_fini(void)
+{
+ brt_stat_fini();
+
+ kmem_cache_destroy(brt_entry_cache);
+ kmem_cache_destroy(brt_pending_entry_cache);
+}
+
+static brt_entry_t *
+brt_entry_alloc(const brt_entry_t *bre_init)
+{
+ brt_entry_t *bre;
+
+ bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
+ bre->bre_offset = bre_init->bre_offset;
+ bre->bre_refcount = bre_init->bre_refcount;
+
+ return (bre);
+}
+
+static void
+brt_entry_free(brt_entry_t *bre)
+{
+
+ kmem_cache_free(brt_entry_cache, bre);
+}
+
+static void
+brt_entry_addref(brt_t *brt, const blkptr_t *bp)
+{
+ brt_vdev_t *brtvd;
+ brt_entry_t *bre, *racebre;
+ brt_entry_t bre_search;
+ avl_index_t where;
+ uint64_t vdevid;
+ int error;
+
+ ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
+
+ brt_entry_fill(bp, &bre_search, &vdevid);
+
+ brt_wlock(brt);
+
+ brtvd = brt_vdev(brt, vdevid);
+ if (brtvd == NULL) {
+ ASSERT3U(vdevid, >=, brt->brt_nvdevs);
+
+ /* New VDEV was added. */
+ brt_vdevs_expand(brt, vdevid + 1);
+ brtvd = brt_vdev(brt, vdevid);
+ }
+ ASSERT(brtvd != NULL);
+ if (!brtvd->bv_initiated)
+ brt_vdev_realloc(brt, brtvd);
+
+ bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+ if (bre != NULL) {
+ BRTSTAT_BUMP(brt_addref_entry_in_memory);
+ } else {
+ /*
+ * brt_entry_lookup() may drop the BRT (read) lock and
+ * reacquire it (write).
+ */
+ error = brt_entry_lookup(brt, brtvd, &bre_search);
+ /* bre_search now contains correct bre_refcount */
+ ASSERT(error == 0 || error == ENOENT);
+ if (error == 0)
+ BRTSTAT_BUMP(brt_addref_entry_on_disk);
+ else
+ BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
+ /*
+ * When the BRT lock was dropped, brt_vdevs[] may have been
+ * expanded and reallocated, we need to update brtvd's pointer.
+ */
+ brtvd = brt_vdev(brt, vdevid);
+ ASSERT(brtvd != NULL);
+
+ racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+ if (racebre == NULL) {
+ bre = brt_entry_alloc(&bre_search);
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ avl_insert(&brtvd->bv_tree, bre, where);
+ brt->brt_nentries++;
+ } else {
+ /*
+ * The entry was added when the BRT lock was dropped in
+ * brt_entry_lookup().
+ */
+ BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
+ bre = racebre;
+ }
+ }
+ bre->bre_refcount++;
+ brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+ brt_unlock(brt);
+}
+
+/* Return TRUE if block should be freed immediately. */
+boolean_t
+brt_entry_decref(spa_t *spa, const blkptr_t *bp)
+{
+ brt_t *brt = spa->spa_brt;
+ brt_vdev_t *brtvd;
+ brt_entry_t *bre, *racebre;
+ brt_entry_t bre_search;
+ avl_index_t where;
+ uint64_t vdevid;
+ int error;
+
+ brt_entry_fill(bp, &bre_search, &vdevid);
+
+ brt_wlock(brt);
+
+ brtvd = brt_vdev(brt, vdevid);
+ ASSERT(brtvd != NULL);
+
+ bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+ if (bre != NULL) {
+ BRTSTAT_BUMP(brt_decref_entry_in_memory);
+ goto out;
+ } else {
+ BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
+ }
+
+ /*
+ * brt_entry_lookup() may drop the BRT lock and reacquire it.
+ */
+ error = brt_entry_lookup(brt, brtvd, &bre_search);
+ /* bre_search now contains correct bre_refcount */
+ ASSERT(error == 0 || error == ENOENT);
+ /*
+ * When the BRT lock was dropped, brt_vdevs[] may have been expanded
+ * and reallocated, we need to update brtvd's pointer.
+ */
+ brtvd = brt_vdev(brt, vdevid);
+ ASSERT(brtvd != NULL);
+
+ if (error == ENOENT) {
+ BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
+ bre = NULL;
+ goto out;
+ }
+
+ racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+ if (racebre != NULL) {
+ /*
+ * The entry was added when the BRT lock was dropped in
+ * brt_entry_lookup().
+ */
+ BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
+ bre = racebre;
+ goto out;
+ }
+
+ BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
+ bre = brt_entry_alloc(&bre_search);
+ ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+ avl_insert(&brtvd->bv_tree, bre, where);
+ brt->brt_nentries++;
+
+out:
+ if (bre == NULL) {
+ /*
+ * This is a free of a regular (not cloned) block.
+ */
+ brt_unlock(brt);
+ BRTSTAT_BUMP(brt_decref_no_entry);
+ return (B_TRUE);
+ }
+ if (bre->bre_refcount == 0) {
+ brt_unlock(brt);
+ BRTSTAT_BUMP(brt_decref_free_data_now);
+ return (B_TRUE);
+ }
+
+ ASSERT(bre->bre_refcount > 0);
+ bre->bre_refcount--;
+ if (bre->bre_refcount == 0)
+ BRTSTAT_BUMP(brt_decref_free_data_later);
+ else
+ BRTSTAT_BUMP(brt_decref_entry_still_referenced);
+ brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+ brt_unlock(brt);
+
+ return (B_FALSE);
+}
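The boolean result above is what lets the caller free non-cloned blocks on the spot while deferring frees of still-referenced clones. Below is a minimal user-space model of that decision only, not the actual free path; the struct and function names are invented for the illustration.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, simplified stand-in for a BRT entry (illustration only). */
struct fake_bre {
	unsigned long long refcount;	/* number of additional clones */
};

/*
 * Mirror of the decision made by brt_entry_decref(): no entry or a zero
 * refcount means the block is not (or no longer) cloned, so the caller
 * may free it immediately; otherwise one reference is dropped and the
 * free is deferred until the last clone goes away.
 */
static bool
fake_decref_free_now(struct fake_bre *bre)
{
	if (bre == NULL || bre->refcount == 0)
		return (true);		/* free the block now */
	bre->refcount--;
	return (false);			/* block still referenced */
}

int
main(void)
{
	struct fake_bre bre = { .refcount = 2 };

	printf("%d\n", fake_decref_free_now(&bre));	/* 0: still cloned */
	printf("%d\n", fake_decref_free_now(&bre));	/* 0: still cloned */
	printf("%d\n", fake_decref_free_now(&bre));	/* 1: free it now */
	printf("%d\n", fake_decref_free_now(NULL));	/* 1: never cloned */
	return (0);
}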
+
+uint64_t
+brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
+{
+ brt_t *brt = spa->spa_brt;
+ brt_vdev_t *brtvd;
+ brt_entry_t bre_search, *bre;
+ uint64_t vdevid, refcnt;
+ int error;
+
+ brt_entry_fill(bp, &bre_search, &vdevid);
+
+ brt_rlock(brt);
+
+ brtvd = brt_vdev(brt, vdevid);
+ ASSERT(brtvd != NULL);
+
+ bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+ if (bre == NULL) {
+ error = brt_entry_lookup(brt, brtvd, &bre_search);
+ ASSERT(error == 0 || error == ENOENT);
+ if (error == ENOENT)
+ refcnt = 0;
+ else
+ refcnt = bre_search.bre_refcount;
+ } else
+ refcnt = bre->bre_refcount;
+
+ brt_unlock(brt);
+ return (refcnt);
+}
+
+static void
+brt_prefetch(brt_t *brt, const blkptr_t *bp)
+{
+ brt_entry_t bre;
+ uint64_t vdevid;
+
+ ASSERT(bp != NULL);
+
+ if (!brt_zap_prefetch)
+ return;
+
+ brt_entry_fill(bp, &bre, &vdevid);
+
+ brt_entry_prefetch(brt, vdevid, &bre);
+}
+
+static int
+brt_pending_entry_compare(const void *x1, const void *x2)
+{
+ const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
+ const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
+ int cmp;
+
+ cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
+ DVA_GET_VDEV(&bp2->blk_dva[0]));
+ if (cmp == 0) {
+ cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
+ DVA_GET_OFFSET(&bp2->blk_dva[0]));
+ if (unlikely(cmp == 0)) {
+ cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
+ }
+ }
+
+ return (cmp);
+}
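Pending entries are ordered by the first DVA's vdev, then offset, then birth TXG, so repeated clones of one block collapse into a single node whose bpe_count is bumped. A stand-alone sketch of the same three-level comparison, assuming a flattened key struct in place of the real blkptr_t accessors:

#include <stdint.h>
#include <stdio.h>

/* Three-way comparison helper, equivalent in spirit to ZFS's TREE_CMP(). */
#define CMP3(a, b)	(((a) > (b)) - ((a) < (b)))

/* Hypothetical flattened key for the example (the real code reads blk_dva[0]). */
struct pending_key {
	uint64_t vdev;
	uint64_t offset;
	uint64_t birth;
};

static int
pending_key_compare(const struct pending_key *k1, const struct pending_key *k2)
{
	int cmp;

	cmp = CMP3(k1->vdev, k2->vdev);
	if (cmp == 0) {
		cmp = CMP3(k1->offset, k2->offset);
		if (cmp == 0)
			cmp = CMP3(k1->birth, k2->birth);
	}
	return (cmp);
}

int
main(void)
{
	struct pending_key a = { .vdev = 0, .offset = 4096, .birth = 100 };
	struct pending_key b = { .vdev = 0, .offset = 8192, .birth = 100 };

	printf("%d\n", pending_key_compare(&a, &b));	/* -1: a sorts first */
	printf("%d\n", pending_key_compare(&a, &a));	/*  0: same block   */
	return (0);
}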
+
+void
+brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ brt_t *brt;
+ avl_tree_t *pending_tree;
+ kmutex_t *pending_lock;
+ brt_pending_entry_t *bpe, *newbpe;
+ avl_index_t where;
+ uint64_t txg;
+
+ brt = spa->spa_brt;
+ txg = dmu_tx_get_txg(tx);
+ ASSERT3U(txg, !=, 0);
+ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+ pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+ newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
+ newbpe->bpe_bp = *bp;
+ newbpe->bpe_count = 1;
+
+ mutex_enter(pending_lock);
+
+ bpe = avl_find(pending_tree, newbpe, &where);
+ if (bpe == NULL) {
+ avl_insert(pending_tree, newbpe, where);
+ newbpe = NULL;
+ } else {
+ bpe->bpe_count++;
+ }
+
+ mutex_exit(pending_lock);
+
+ if (newbpe != NULL) {
+ ASSERT(bpe != NULL);
+ ASSERT(bpe != newbpe);
+ kmem_cache_free(brt_pending_entry_cache, newbpe);
+ } else {
+ ASSERT(bpe == NULL);
+
+ /* Prefetch BRT entry for the syncing context. */
+ brt_prefetch(brt, bp);
+ }
+}
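The pending tree and lock are selected by masking the txg, so each open transaction group gets its own slot out of a small ring that syncing later drains. A tiny sketch of that indexing, assuming the usual TXG_SIZE of 4 from the ZFS headers:

#include <stdint.h>
#include <stdio.h>

/* Assumed values; in ZFS these come from sys/txg.h. */
#define TXG_SIZE	4
#define TXG_MASK	(TXG_SIZE - 1)

int
main(void)
{
	/*
	 * Consecutive txgs reuse the small ring of per-txg trees/locks.
	 * A slot is only reused after its txg has been synced and its
	 * pending tree drained by brt_pending_apply().
	 */
	for (uint64_t txg = 120; txg < 126; txg++)
		printf("txg %llu -> pending slot %llu\n",
		    (unsigned long long)txg,
		    (unsigned long long)(txg & TXG_MASK));
	return (0);
}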
+
+void
+brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ brt_t *brt;
+ avl_tree_t *pending_tree;
+ kmutex_t *pending_lock;
+ brt_pending_entry_t *bpe, bpe_search;
+ uint64_t txg;
+
+ brt = spa->spa_brt;
+ txg = dmu_tx_get_txg(tx);
+ ASSERT3U(txg, !=, 0);
+ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+ pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+ bpe_search.bpe_bp = *bp;
+
+ mutex_enter(pending_lock);
+
+ bpe = avl_find(pending_tree, &bpe_search, NULL);
+ /* I believe we should always find bpe when this function is called. */
+ if (bpe != NULL) {
+ ASSERT(bpe->bpe_count > 0);
+
+ bpe->bpe_count--;
+ if (bpe->bpe_count == 0) {
+ avl_remove(pending_tree, bpe);
+ kmem_cache_free(brt_pending_entry_cache, bpe);
+ }
+ }
+
+ mutex_exit(pending_lock);
+}
+
+void
+brt_pending_apply(spa_t *spa, uint64_t txg)
+{
+ brt_t *brt = spa->spa_brt;
+ brt_pending_entry_t *bpe;
+ avl_tree_t *pending_tree;
+ void *c;
+
+ ASSERT3U(txg, !=, 0);
+
+ /*
+ * We are in syncing context, so no other brt_pending_tree accesses
+ * are possible for the TXG, so we don't need to acquire brt_pending_lock.
+ */
+ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+
+ c = NULL;
+ while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
+ boolean_t added_to_ddt;
+
+ for (int i = 0; i < bpe->bpe_count; i++) {
+ /*
+ * If the block has the DEDUP bit set, it already exists in
+ * the DEDUP table, so we can just use that instead of
+ * creating a new entry in the BRT table.
+ */
+ if (BP_GET_DEDUP(&bpe->bpe_bp)) {
+ added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
+ } else {
+ added_to_ddt = B_FALSE;
+ }
+ if (!added_to_ddt)
+ brt_entry_addref(brt, &bpe->bpe_bp);
+ }
+
+ kmem_cache_free(brt_pending_entry_cache, bpe);
+ }
+}
+
+static void
+brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
+{
+ if (bre->bre_refcount == 0) {
+ int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
+ BRT_KEY_WORDS, tx);
+ VERIFY(error == 0 || error == ENOENT);
+ } else {
+ VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
+ BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
+ &bre->bre_refcount, tx));
+ }
+}
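Syncing an entry is a "remove if the refcount hit zero, otherwise upsert the 64-bit refcount" step against the per-vdev ZAP. The sketch below models only that decision with a toy fixed-size table standing in for the ZAP; all names here are invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A toy "ZAP": fixed table of (key, refcount) pairs; 0 refcount == absent. */
#define FAKE_ZAP_SLOTS	8
struct fake_zap {
	uint64_t key[FAKE_ZAP_SLOTS];
	uint64_t refcount[FAKE_ZAP_SLOTS];
};

/*
 * Analogue of brt_sync_entry(): a refcount of zero removes the key,
 * anything else creates or updates it.
 */
static void
fake_sync_entry(struct fake_zap *zap, uint64_t key, uint64_t refcount)
{
	int free_slot = -1;

	for (int i = 0; i < FAKE_ZAP_SLOTS; i++) {
		if (zap->refcount[i] != 0 && zap->key[i] == key) {
			zap->refcount[i] = refcount;	/* update or remove */
			return;
		}
		if (zap->refcount[i] == 0 && free_slot == -1)
			free_slot = i;
	}
	if (refcount != 0 && free_slot != -1) {		/* insert */
		zap->key[free_slot] = key;
		zap->refcount[free_slot] = refcount;
	}
}

int
main(void)
{
	struct fake_zap zap;

	memset(&zap, 0, sizeof (zap));
	fake_sync_entry(&zap, 0x4000, 3);	/* first sync: insert */
	fake_sync_entry(&zap, 0x4000, 1);	/* later sync: update */
	fake_sync_entry(&zap, 0x4000, 0);	/* last clone freed: remove */
	printf("refcount after removal: %llu\n",
	    (unsigned long long)zap.refcount[0]);
	return (0);
}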
+
+static void
+brt_sync_table(brt_t *brt, dmu_tx_t *tx)
+{
+ brt_vdev_t *brtvd;
+ brt_entry_t *bre;
+ dnode_t *dn;
+ uint64_t vdevid;
+ void *c;
+
+ brt_wlock(brt);
+
+ for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+ brtvd = &brt->brt_vdevs[vdevid];
+
+ if (!brtvd->bv_initiated)
+ continue;
+
+ if (!brtvd->bv_meta_dirty) {
+ ASSERT(!brtvd->bv_entcount_dirty);
+ ASSERT0(avl_numnodes(&brtvd->bv_tree));
+ continue;
+ }
+
+ ASSERT(!brtvd->bv_entcount_dirty ||
+ avl_numnodes(&brtvd->bv_tree) != 0);
+
+ if (brtvd->bv_mos_brtvdev == 0)
+ brt_vdev_create(brt, brtvd, tx);
+
+ VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
+ FTAG, &dn));
+
+ c = NULL;
+ while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
+ brt_sync_entry(dn, bre, tx);
+ brt_entry_free(bre);
+ ASSERT(brt->brt_nentries > 0);
+ brt->brt_nentries--;
+ }
+
+ dnode_rele(dn, FTAG);
+
+ brt_vdev_sync(brt, brtvd, tx);
+
+ if (brtvd->bv_totalcount == 0)
+ brt_vdev_destroy(brt, brtvd, tx);
+ }
+
+ ASSERT0(brt->brt_nentries);
+
+ brt_unlock(brt);
+}
+
+void
+brt_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ brt_t *brt;
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ brt = spa->spa_brt;
+ brt_rlock(brt);
+ if (brt->brt_nentries == 0) {
+ /* No changes. */
+ brt_unlock(brt);
+ return;
+ }
+ brt_unlock(brt);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ brt_sync_table(brt, tx);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+brt_table_alloc(brt_t *brt)
+{
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ avl_create(&brt->brt_pending_tree[i],
+ brt_pending_entry_compare,
+ sizeof (brt_pending_entry_t),
+ offsetof(brt_pending_entry_t, bpe_node));
+ mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
+ NULL);
+ }
+}
+
+static void
+brt_table_free(brt_t *brt)
+{
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
+
+ avl_destroy(&brt->brt_pending_tree[i]);
+ mutex_destroy(&brt->brt_pending_lock[i]);
+ }
+}
+
+static void
+brt_alloc(spa_t *spa)
+{
+ brt_t *brt;
+
+ ASSERT(spa->spa_brt == NULL);
+
+ brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
+ rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
+ brt->brt_spa = spa;
+ brt->brt_rangesize = 0;
+ brt->brt_nentries = 0;
+ brt->brt_vdevs = NULL;
+ brt->brt_nvdevs = 0;
+ brt_table_alloc(brt);
+
+ spa->spa_brt = brt;
+}
+
+void
+brt_create(spa_t *spa)
+{
+
+ brt_alloc(spa);
+ brt_vdevs_alloc(spa->spa_brt, B_FALSE);
+}
+
+int
+brt_load(spa_t *spa)
+{
+
+ brt_alloc(spa);
+ brt_vdevs_alloc(spa->spa_brt, B_TRUE);
+
+ return (0);
+}
+
+void
+brt_unload(spa_t *spa)
+{
+ brt_t *brt = spa->spa_brt;
+
+ if (brt == NULL)
+ return;
+
+ brt_vdevs_free(brt);
+ brt_table_free(brt);
+ rw_destroy(&brt->brt_lock);
+ kmem_free(brt, sizeof (*brt));
+ spa->spa_brt = NULL;
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
+ "Enable prefetching of BRT ZAP entries");
+ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
+ "BRT ZAP leaf blockshift");
+ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
+ "BRT ZAP indirect blockshift");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c
index 57b9dbbb2b50..9c52083603f1 100644
--- a/sys/contrib/openzfs/module/zfs/btree.c
+++ b/sys/contrib/openzfs/module/zfs/btree.c
@@ -53,18 +53,30 @@ kmem_cache_t *zfs_btree_leaf_cache;
* (while the asymptotic complexity of the other steps is the same, the
* importance of the constant factors cannot be denied).
*/
-int zfs_btree_verify_intensity = 0;
+uint_t zfs_btree_verify_intensity = 0;
/*
- * A convenience function to silence warnings from memmove's return value and
- * change argument order to src, dest.
+ * Convenience functions to silence warnings from memcpy/memmove's
+ * return values and change argument order to src, dest.
*/
static void
+bcpy(const void *src, void *dest, size_t size)
+{
+ (void) memcpy(dest, src, size);
+}
+
+static void
bmov(const void *src, void *dest, size_t size)
{
(void) memmove(dest, src, size);
}
+static boolean_t
+zfs_btree_is_core(struct zfs_btree_hdr *hdr)
+{
+ return (hdr->bth_first == -1);
+}
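Reusing bth_first as both the leaf's first-element offset and, when it is (uint32_t)-1, the core-node marker is what lets the old bth_core flag go away. A tiny model of that encoding, with invented field and constant names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CORE_MARKER	((uint32_t)-1)	/* sentinel: node has no element window */

struct node_hdr {
	uint32_t first;		/* leaf: index of first live element */
	uint32_t count;		/* number of live elements           */
};

static bool
node_is_core(const struct node_hdr *h)
{
	return (h->first == CORE_MARKER);
}

int
main(void)
{
	struct node_hdr leaf = { .first = 3, .count = 10 };
	struct node_hdr core = { .first = CORE_MARKER, .count = 5 };

	printf("leaf: core=%d\n", node_is_core(&leaf));	/* 0 */
	printf("core: core=%d\n", node_is_core(&core));	/* 1 */
	return (0);
}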
+
#ifdef _ILP32
#define BTREE_POISON 0xabadb10c
#else
@@ -76,59 +88,74 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
size_t size = tree->bt_elem_size;
- if (!hdr->bth_core) {
- zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
- (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f,
- BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) -
- hdr->bth_count * size);
- } else {
+ if (zfs_btree_is_core(hdr)) {
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
- for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
+ i++) {
node->btc_children[i] =
(zfs_btree_hdr_t *)BTREE_POISON;
}
(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
(BTREE_CORE_ELEMS - hdr->bth_count) * size);
+ } else {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
+ (void) memset(leaf->btl_elems +
+ (hdr->bth_first + hdr->bth_count) * size, 0x0f,
+ tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
+ (hdr->bth_first + hdr->bth_count) * size);
}
#endif
}
static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
- uint64_t offset)
+ uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
size_t size = tree->bt_elem_size;
- ASSERT3U(offset, >=, hdr->bth_count);
- if (!hdr->bth_core) {
- zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
- (void) memset(leaf->btl_elems + offset * size, 0x0f, size);
- } else {
+ if (zfs_btree_is_core(hdr)) {
+ ASSERT3U(idx, >=, hdr->bth_count);
+ ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
+ ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
- node->btc_children[offset + 1] =
- (zfs_btree_hdr_t *)BTREE_POISON;
- (void) memset(node->btc_elems + offset * size, 0x0f, size);
+ for (uint32_t i = 1; i <= count; i++) {
+ node->btc_children[idx + i] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ }
+ (void) memset(node->btc_elems + idx * size, 0x0f, count * size);
+ } else {
+ ASSERT3U(idx, <=, tree->bt_leaf_cap);
+ ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems +
+ (hdr->bth_first + idx) * size, 0x0f, count * size);
}
#endif
}
static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
- uint64_t offset)
+ uint32_t idx)
{
#ifdef ZFS_DEBUG
size_t size = tree->bt_elem_size;
- uint8_t eval = 0x0f;
- if (hdr->bth_core) {
+ if (zfs_btree_is_core(hdr)) {
+ ASSERT3U(idx, <, BTREE_CORE_ELEMS);
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
- VERIFY3P(node->btc_children[offset + 1], ==, cval);
- for (int i = 0; i < size; i++)
- VERIFY3U(node->btc_elems[offset * size + i], ==, eval);
+ VERIFY3P(node->btc_children[idx + 1], ==, cval);
+ for (size_t i = 0; i < size; i++)
+ VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
} else {
+ ASSERT3U(idx, <, tree->bt_leaf_cap);
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
- for (int i = 0; i < size; i++)
- VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval);
+ if (idx >= tree->bt_leaf_cap - hdr->bth_first)
+ return;
+ for (size_t i = 0; i < size; i++) {
+ VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
+ * size + i], ==, 0x0f);
+ }
}
#endif
}
@@ -137,8 +164,7 @@ void
zfs_btree_init(void)
{
zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
- BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL,
- NULL, 0);
+ BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
@@ -147,21 +173,52 @@ zfs_btree_fini(void)
kmem_cache_destroy(zfs_btree_leaf_cache);
}
+static void *
+zfs_btree_leaf_alloc(zfs_btree_t *tree)
+{
+ if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
+ return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
+ else
+ return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
+}
+
+static void
+zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
+{
+ if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
+ return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
+ else
+ return (kmem_free(ptr, tree->bt_leaf_size));
+}
+
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
- size_t size)
+ bt_find_in_buf_f bt_find_in_buf, size_t size)
{
- /*
- * We need a minimmum of 4 elements so that when we split a node we
- * always have at least two elements in each node. This simplifies the
- * logic in zfs_btree_bulk_finish, since it means the last leaf will
- * always have a left sibling to share with (unless it's the root).
- */
- ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4);
+ zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
+ BTREE_LEAF_SIZE);
+}
+
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
+ const void *value, zfs_btree_index_t *where);
+
+void
+zfs_btree_create_custom(zfs_btree_t *tree,
+ int (*compar) (const void *, const void *),
+ bt_find_in_buf_f bt_find_in_buf,
+ size_t size, size_t lsize)
+{
+ size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
- bzero(tree, sizeof (*tree));
+ ASSERT3U(size, <=, esize / 2);
+ memset(tree, 0, sizeof (*tree));
tree->bt_compar = compar;
+ tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
+ zfs_btree_find_in_buf : bt_find_in_buf;
tree->bt_elem_size = size;
+ tree->bt_leaf_size = lsize;
+ tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
tree->bt_height = -1;
tree->bt_bulk = NULL;
}
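bt_leaf_cap is the leaf byte area (lsize minus the btl_elems offset) divided by the element size and rounded down to an even count, which the split logic relies on. A worked stand-alone version of the arithmetic, using assumed example sizes rather than the real struct layout:

#include <stddef.h>
#include <stdio.h>

/* Round x down to a multiple of align (power of two), like P2ALIGN_TYPED. */
#define P2ALIGN_DOWN(x, align)	((x) & ~((align) - 1))

int
main(void)
{
	/* Assumed example values; the real ones come from zfs_btree_leaf_t. */
	size_t leaf_size = 4096;	/* bt_leaf_size (lsize)                  */
	size_t hdr_size = 16;		/* offsetof(zfs_btree_leaf_t, btl_elems) */
	size_t elem_size = 48;		/* bt_elem_size                          */

	size_t esize = leaf_size - hdr_size;		 /* usable bytes      */
	size_t cap = P2ALIGN_DOWN(esize / elem_size, 2); /* even capacity     */

	printf("esize=%zu raw=%zu bt_leaf_cap=%zu\n",
	    esize, esize / elem_size, cap);	/* esize=4080 raw=85 cap=84 */
	return (0);
}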
@@ -170,21 +227,20 @@ zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
* Find value in the array of elements provided. Uses a simple binary search.
*/
static void *
-zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems,
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
const void *value, zfs_btree_index_t *where)
{
- uint64_t max = nelems;
- uint64_t min = 0;
+ uint32_t max = nelems;
+ uint32_t min = 0;
while (max > min) {
- uint64_t idx = (min + max) / 2;
+ uint32_t idx = (min + max) / 2;
uint8_t *cur = buf + idx * tree->bt_elem_size;
int comp = tree->bt_compar(cur, value);
- if (comp == -1) {
+ if (comp < 0) {
min = idx + 1;
- } else if (comp == 1) {
+ } else if (comp > 0) {
max = idx;
} else {
- ASSERT0(comp);
where->bti_offset = idx;
where->bti_before = B_FALSE;
return (cur);
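The search runs over a packed buffer of fixed-size elements and, on a miss, leaves the insertion point behind; the change above also relaxes the comparator contract to "any negative/zero/positive value" rather than exactly -1/0/+1. A minimal stand-alone sketch of the same search, with a simplified index struct:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct buf_index {
	uint32_t offset;	/* matching slot, or insertion point */
	int before;		/* nonzero: value sorts before 'offset' */
};

/* Binary search over 'nelems' elements of 'size' bytes each, as in the btree. */
static void *
find_in_buf(uint8_t *buf, uint32_t nelems, size_t size, const void *value,
    int (*compar)(const void *, const void *), struct buf_index *where)
{
	uint32_t min = 0, max = nelems;

	while (max > min) {
		uint32_t idx = (min + max) / 2;
		uint8_t *cur = buf + (size_t)idx * size;
		int comp = compar(cur, value);

		if (comp < 0)		/* only the sign matters */
			min = idx + 1;
		else if (comp > 0)
			max = idx;
		else {
			where->offset = idx;
			where->before = 0;
			return (cur);
		}
	}
	where->offset = max;
	where->before = 1;
	return (NULL);
}

static int
cmp_u32(const void *a, const void *b)
{
	uint32_t x, y;

	memcpy(&x, a, sizeof (x));
	memcpy(&y, b, sizeof (y));
	return ((x > y) - (x < y));
}

int
main(void)
{
	uint32_t elems[] = { 10, 20, 30, 40 };
	uint32_t key = 25;
	struct buf_index where;

	(void) find_in_buf((uint8_t *)elems, 4, sizeof (uint32_t), &key,
	    cmp_u32, &where);
	printf("insert before index %u\n", where.offset);	/* 2 */
	return (0);
}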
@@ -219,12 +275,13 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
* bulk-insert mode are to insert new elements.
*/
zfs_btree_index_t idx;
+ size_t size = tree->bt_elem_size;
if (tree->bt_bulk != NULL) {
zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
- int compar = tree->bt_compar(last_leaf->btl_elems +
- ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size),
- value);
- if (compar < 0) {
+ int comp = tree->bt_compar(last_leaf->btl_elems +
+ (last_leaf->btl_hdr.bth_first +
+ last_leaf->btl_hdr.bth_count - 1) * size, value);
+ if (comp < 0) {
/*
* If what they're looking for is after the last
* element, it's not in the tree.
@@ -236,7 +293,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
where->bti_before = B_TRUE;
}
return (NULL);
- } else if (compar == 0) {
+ } else if (comp == 0) {
if (where != NULL) {
where->bti_node = (zfs_btree_hdr_t *)last_leaf;
where->bti_offset =
@@ -244,18 +301,20 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
where->bti_before = B_FALSE;
}
return (last_leaf->btl_elems +
- ((last_leaf->btl_hdr.bth_count - 1) *
- tree->bt_elem_size));
+ (last_leaf->btl_hdr.bth_first +
+ last_leaf->btl_hdr.bth_count - 1) * size);
}
- if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) {
+ if (tree->bt_compar(last_leaf->btl_elems +
+ last_leaf->btl_hdr.bth_first * size, value) <= 0) {
/*
* If what they're looking for is after the first
* element in the last leaf, it's in the last leaf or
* it's not in the tree.
*/
- void *d = zfs_btree_find_in_buf(tree,
- last_leaf->btl_elems, last_leaf->btl_hdr.bth_count,
- value, &idx);
+ void *d = tree->bt_find_in_buf(tree,
+ last_leaf->btl_elems +
+ last_leaf->btl_hdr.bth_first * size,
+ last_leaf->btl_hdr.bth_count, value, &idx);
if (where != NULL) {
idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
@@ -266,8 +325,8 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
}
zfs_btree_core_t *node = NULL;
- uint64_t child = 0;
- uint64_t depth = 0;
+ uint32_t child = 0;
+ uint32_t depth = 0;
/*
* Iterate down the tree, finding which child the value should be in
@@ -276,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
ASSERT3P(node, !=, NULL);
- void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+ void *d = tree->bt_find_in_buf(tree, node->btc_elems,
node->btc_hdr.bth_count, value, &idx);
EQUIV(d != NULL, !idx.bti_before);
if (d != NULL) {
@@ -296,7 +355,8 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
*/
zfs_btree_leaf_t *leaf = (depth == 0 ?
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
- void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems,
+ void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
+ leaf->btl_hdr.bth_first * size,
leaf->btl_hdr.bth_count, value, &idx);
if (where != NULL) {
@@ -366,24 +426,23 @@ enum bt_shift_direction {
* shift is determined by shape. The direction is determined by dir.
*/
static inline void
-bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
- uint64_t count, uint64_t off, enum bt_shift_shape shape,
+bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
+ uint32_t count, uint32_t off, enum bt_shift_shape shape,
enum bt_shift_direction dir)
{
size_t size = tree->bt_elem_size;
- ASSERT(node->btc_hdr.bth_core);
+ ASSERT(zfs_btree_is_core(&node->btc_hdr));
uint8_t *e_start = node->btc_elems + idx * size;
- int sign = (dir == BSD_LEFT ? -1 : +1);
- uint8_t *e_out = e_start + sign * off * size;
- uint64_t e_count = count;
- bmov(e_start, e_out, e_count * size);
+ uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
+ e_start + off * size);
+ bmov(e_start, e_out, count * size);
zfs_btree_hdr_t **c_start = node->btc_children + idx +
(shape == BSS_TRAPEZOID ? 0 : 1);
zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
c_start + off);
- uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
bmov(c_start, c_out, c_count * sizeof (*c_start));
}
@@ -394,8 +453,8 @@ bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
* false if it is a parallelogram.
*/
static inline void
-bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
- uint64_t count, enum bt_shift_shape shape)
+bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
+ uint32_t count, enum bt_shift_shape shape)
{
bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}
@@ -405,8 +464,8 @@ bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
* Starts with elements[idx] and children[idx] and one more child than element.
*/
static inline void
-bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
- uint64_t count, enum bt_shift_shape shape)
+bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
+ uint32_t count, enum bt_shift_shape shape)
{
bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}
@@ -417,30 +476,78 @@ bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
* is determined by left.
*/
static inline void
-bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx,
- uint64_t count, uint64_t off, enum bt_shift_direction dir)
+bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
+ uint32_t count, uint32_t off, enum bt_shift_direction dir)
{
size_t size = tree->bt_elem_size;
- ASSERT(!node->btl_hdr.bth_core);
+ zfs_btree_hdr_t *hdr = &node->btl_hdr;
+ ASSERT(!zfs_btree_is_core(hdr));
- uint8_t *start = node->btl_elems + idx * size;
- int sign = (dir == BSD_LEFT ? -1 : +1);
- uint8_t *out = start + sign * off * size;
+ if (count == 0)
+ return;
+ uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
+ uint8_t *out = (dir == BSD_LEFT ? start - off * size :
+ start + off * size);
bmov(start, out, count * size);
}
-static inline void
-bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
- uint64_t count)
+/*
+ * Grow leaf for n new elements before idx.
+ */
+static void
+bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
+ uint32_t n)
{
- bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT);
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ ASSERT(!zfs_btree_is_core(hdr));
+ ASSERT3U(idx, <=, hdr->bth_count);
+ uint32_t capacity = tree->bt_leaf_cap;
+ ASSERT3U(hdr->bth_count + n, <=, capacity);
+ boolean_t cl = (hdr->bth_first >= n);
+ boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);
+
+ if (cl && (!cr || idx <= hdr->bth_count / 2)) {
+ /* Grow left. */
+ hdr->bth_first -= n;
+ bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
+ } else if (cr) {
+ /* Grow right. */
+ bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
+ BSD_RIGHT);
+ } else {
+ /* Grow both ways. */
+ uint32_t fn = hdr->bth_first -
+ (capacity - (hdr->bth_count + n)) / 2;
+ hdr->bth_first -= fn;
+ bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
+ bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
+ n - fn, BSD_RIGHT);
+ }
+ hdr->bth_count += n;
}
-static inline void
-bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
- uint64_t count)
+/*
+ * Shrink leaf by n elements starting from idx.
+ */
+static void
+bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
+ uint32_t n)
{
- bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT);
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ ASSERT(!zfs_btree_is_core(hdr));
+ ASSERT3U(idx, <=, hdr->bth_count);
+ ASSERT3U(idx + n, <=, hdr->bth_count);
+
+ if (idx <= (hdr->bth_count - n) / 2) {
+ bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
+ zfs_btree_poison_node_at(tree, hdr, 0, n);
+ hdr->bth_first += n;
+ } else {
+ bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
+ BSD_LEFT);
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
+ }
+ hdr->bth_count -= n;
}
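Because live elements now sit in a window [bth_first, bth_first + bth_count) inside the leaf buffer, an insert can open a gap by sliding the head left, the tail right, or by recentering, as bt_grow_leaf() does above. A small model of just that direction choice, with an invented header struct:

#include <stdint.h>
#include <stdio.h>

/* Simplified leaf header: elements live at [first, first + count). */
struct leaf_hdr {
	uint32_t first;
	uint32_t count;
};

/*
 * Decide how to open a gap of n slots before idx, following the same
 * preferences as bt_grow_leaf(): grow left if there is room before the
 * window and the insert point is in the front half, otherwise grow right
 * if there is room after the window, otherwise recenter the window.
 */
static const char *
grow_direction(const struct leaf_hdr *h, uint32_t capacity, uint32_t idx,
    uint32_t n)
{
	int can_left = (h->first >= n);
	int can_right = (h->first + h->count + n <= capacity);

	if (can_left && (!can_right || idx <= h->count / 2))
		return ("grow left");
	else if (can_right)
		return ("grow right");
	else
		return ("grow both ways (recenter)");
}

int
main(void)
{
	struct leaf_hdr h = { .first = 4, .count = 8 };	/* capacity 16 */

	printf("%s\n", grow_direction(&h, 16, 1, 1));	/* grow left  */
	printf("%s\n", grow_direction(&h, 16, 7, 1));	/* grow right */
	h.first = 0;
	h.count = 15;
	printf("%s\n", grow_direction(&h, 16, 8, 1));	/* grow right */
	return (0);
}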
/*
@@ -448,32 +555,33 @@ bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
* parameter behaves the same as it does in the shift logic.
*/
static inline void
-bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx,
- uint64_t count, zfs_btree_core_t *dest, uint64_t didx,
+bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
+ uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
enum bt_shift_shape shape)
{
size_t size = tree->bt_elem_size;
- ASSERT(source->btc_hdr.bth_core);
- ASSERT(dest->btc_hdr.bth_core);
+ ASSERT(zfs_btree_is_core(&source->btc_hdr));
+ ASSERT(zfs_btree_is_core(&dest->btc_hdr));
- bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
+ bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
count * size);
- uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
- bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
+ uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
c_count * sizeof (*source->btc_children));
}
static inline void
-bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx,
- uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx)
+bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
+ uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
{
size_t size = tree->bt_elem_size;
- ASSERT(!source->btl_hdr.bth_core);
- ASSERT(!dest->btl_hdr.bth_core);
+ ASSERT(!zfs_btree_is_core(&source->btl_hdr));
+ ASSERT(!zfs_btree_is_core(&dest->btl_hdr));
- bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size,
+ bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
+ dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
count * size);
}
@@ -482,30 +590,31 @@ bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx,
* put its location in where if non-null.
*/
static void *
-zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where)
+zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ zfs_btree_index_t *where)
{
zfs_btree_hdr_t *node;
- for (node = hdr; node->bth_core; node =
- ((zfs_btree_core_t *)node)->btc_children[0])
+ for (node = hdr; zfs_btree_is_core(node);
+ node = ((zfs_btree_core_t *)node)->btc_children[0])
;
- ASSERT(!node->bth_core);
+ ASSERT(!zfs_btree_is_core(node));
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
if (where != NULL) {
where->bti_node = node;
where->bti_offset = 0;
where->bti_before = B_FALSE;
}
- return (&leaf->btl_elems[0]);
+ return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
}
/* Insert an element and a child into a core node at the given offset. */
static void
zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
- uint64_t offset, zfs_btree_hdr_t *new_node, void *buf)
+ uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
{
- uint64_t size = tree->bt_elem_size;
+ size_t size = tree->bt_elem_size;
zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
ASSERT3P(par_hdr, ==, new_node->bth_parent);
ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);
@@ -515,13 +624,13 @@ zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
par_hdr->bth_count);
}
/* Shift existing elements and children */
- uint64_t count = par_hdr->bth_count - offset;
+ uint32_t count = par_hdr->bth_count - offset;
bt_shift_core_right(tree, parent, offset, count,
BSS_PARALLELOGRAM);
/* Insert new values */
parent->btc_children[offset + 1] = new_node;
- bmov(buf, parent->btc_elems + offset * size, size);
+ bcpy(buf, parent->btc_elems + offset * size, size);
par_hdr->bth_count++;
}
@@ -534,9 +643,8 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
zfs_btree_hdr_t *new_node, void *buf)
{
ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
- uint64_t size = tree->bt_elem_size;
+ size_t size = tree->bt_elem_size;
zfs_btree_core_t *parent = old_node->bth_parent;
- zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
/*
* If this is the root node we were splitting, we create a new root
@@ -550,13 +658,13 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
size, KM_SLEEP);
zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
new_root_hdr->bth_parent = NULL;
- new_root_hdr->bth_core = B_TRUE;
+ new_root_hdr->bth_first = -1;
new_root_hdr->bth_count = 1;
old_node->bth_parent = new_node->bth_parent = new_root;
new_root->btc_children[0] = old_node;
new_root->btc_children[1] = new_node;
- bmov(buf, new_root->btc_elems, size);
+ bcpy(buf, new_root->btc_elems, size);
tree->bt_height++;
tree->bt_root = new_root_hdr;
@@ -568,12 +676,13 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
* Since we have the new separator, binary search for where to put
* new_node.
*/
+ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
zfs_btree_index_t idx;
- ASSERT(par_hdr->bth_core);
- VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ ASSERT(zfs_btree_is_core(par_hdr));
+ VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
par_hdr->bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
- uint64_t offset = idx.bti_offset;
+ uint32_t offset = idx.bti_offset;
ASSERT3U(offset, <=, par_hdr->bth_count);
ASSERT3P(parent->btc_children[offset], ==, old_node);
@@ -604,16 +713,16 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
* We do this in two stages: first we split into two nodes, and then we
* reuse our existing logic to insert the new element and child.
*/
- uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
+ uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
2 : 4)) - 1, 2);
- uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
+ uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
tree->bt_num_nodes++;
zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
BTREE_CORE_ELEMS * size, KM_SLEEP);
zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
new_par_hdr->bth_parent = par_hdr->bth_parent;
- new_par_hdr->bth_core = B_TRUE;
+ new_par_hdr->bth_first = -1;
new_par_hdr->bth_count = move_count;
zfs_btree_poison_node(tree, new_par_hdr);
@@ -624,7 +733,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
/* Store the new separator in a buffer. */
uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
- bmov(parent->btc_elems + keep_count * size, tmp_buf,
+ bcpy(parent->btc_elems + keep_count * size, tmp_buf,
size);
zfs_btree_poison_node(tree, par_hdr);
@@ -636,7 +745,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
/*
* Move the new separator to the existing buffer.
*/
- bmov(tmp_buf, buf, size);
+ bcpy(tmp_buf, buf, size);
} else if (offset > keep_count) {
/* Insert the new node into the right half */
new_node->bth_parent = new_parent;
@@ -646,7 +755,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
/*
* Move the new separator to the existing buffer.
*/
- bmov(tmp_buf, buf, size);
+ bcpy(tmp_buf, buf, size);
} else {
/*
* Move the new separator into the right half, and replace it
@@ -656,16 +765,16 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
bt_shift_core_right(tree, new_parent, 0, move_count,
BSS_TRAPEZOID);
new_parent->btc_children[0] = new_node;
- bmov(tmp_buf, new_parent->btc_elems, size);
+ bcpy(tmp_buf, new_parent->btc_elems, size);
new_par_hdr->bth_count++;
}
kmem_free(tmp_buf, size);
zfs_btree_poison_node(tree, par_hdr);
- for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++)
+ for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
new_parent->btc_children[i]->bth_parent = new_parent;
- for (int i = 0; i <= parent->btc_hdr.bth_count; i++)
+ for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);
/*
@@ -679,34 +788,32 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
/* Insert an element into a leaf node at the given offset. */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
- uint64_t idx, const void *value)
+ uint32_t idx, const void *value)
{
- uint64_t size = tree->bt_elem_size;
- uint8_t *start = leaf->btl_elems + (idx * size);
+ size_t size = tree->bt_elem_size;
zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
- uint64_t capacity __maybe_unused = P2ALIGN((BTREE_LEAF_SIZE -
- sizeof (zfs_btree_hdr_t)) / size, 2);
- uint64_t count = leaf->btl_hdr.bth_count - idx;
- ASSERT3U(leaf->btl_hdr.bth_count, <, capacity);
+ ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);
if (zfs_btree_verify_intensity >= 5) {
zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
leaf->btl_hdr.bth_count);
}
- bt_shift_leaf_right(tree, leaf, idx, count);
- bmov(value, start, size);
- hdr->bth_count++;
+ bt_grow_leaf(tree, leaf, idx, 1);
+ uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
+ bcpy(value, start, size);
}
+static void
+zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);
+
/* Helper function for inserting a new value into leaf at the given index. */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
- const void *value, uint64_t idx)
+ const void *value, uint32_t idx)
{
- uint64_t size = tree->bt_elem_size;
- uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
- sizeof (zfs_btree_hdr_t)) / size, 2);
+ size_t size = tree->bt_elem_size;
+ uint32_t capacity = tree->bt_leaf_cap;
/*
* If the leaf isn't full, shift the elements after idx and insert
@@ -731,32 +838,35 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
* In either case, we're left with one extra element. The leftover
* element will become the new dividing element between the two nodes.
*/
- uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) -
- 1, 2);
- uint64_t keep_count = capacity - move_count - 1;
- ASSERT3U(capacity - move_count, >=, 2);
+ uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
+ uint32_t keep_count = capacity - move_count - 1;
+ ASSERT3U(keep_count, >=, 1);
+ /* If we insert on the left, move one more to keep leaves balanced. */
+ if (idx < keep_count) {
+ keep_count--;
+ move_count++;
+ }
tree->bt_num_nodes++;
- zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
- KM_SLEEP);
+ zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
- new_hdr->bth_core = B_FALSE;
+ new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
+ (idx >= keep_count && idx <= keep_count + move_count / 2);
new_hdr->bth_count = move_count;
zfs_btree_poison_node(tree, new_hdr);
- leaf->btl_hdr.bth_count = keep_count;
-
if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
tree->bt_bulk = new_leaf;
/* Copy the back part to the new leaf. */
- bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf,
- 0);
+ bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);
/* We store the new separator in a buffer we control for simplicity. */
uint8_t *buf = kmem_alloc(size, KM_SLEEP);
- bmov(leaf->btl_elems + (keep_count * size), buf, size);
- zfs_btree_poison_node(tree, &leaf->btl_hdr);
+ bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
+ buf, size);
+
+ bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);
if (idx < keep_count) {
/* Insert into the existing leaf. */
@@ -767,13 +877,11 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
1, value);
} else {
/*
- * Shift the elements in the new leaf to make room for the
- * separator, and use the new value as the new separator.
+ * Insert the planned separator into the new leaf, and use
+ * the new value as the new separator.
*/
- bt_shift_leaf_right(tree, new_leaf, 0, move_count);
- bmov(buf, new_leaf->btl_elems, size);
- bmov(value, buf, size);
- new_hdr->bth_count++;
+ zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
+ bcpy(value, buf, size);
}
/*
@@ -785,18 +893,19 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
kmem_free(buf, size);
}
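A full-leaf split keeps keep_count elements, moves move_count to the new right leaf, and leaves exactly one element over as the separator; bulk mode deliberately keeps the right leaf small, and inserting into the left half shifts one more element across. A quick stand-alone check of that arithmetic, assuming a capacity of 10:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

static void
split_counts(uint32_t capacity, int bulk, uint32_t idx)
{
	/* Same formulas as zfs_btree_insert_into_leaf(). */
	uint32_t move_count = MAX(capacity / (bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;

	/* Inserting into the left half? Shift one more to rebalance. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	printf("cap=%u bulk=%d idx=%u -> keep=%u move=%u separator=1\n",
	    capacity, bulk, idx, keep_count, move_count);
}

int
main(void)
{
	split_counts(10, 0, 8);	/* keep=5 move=4: balanced split            */
	split_counts(10, 0, 2);	/* keep=4 move=5: insert lands on the left  */
	split_counts(10, 1, 9);	/* keep=8 move=1: bulk mode keeps left full */
	return (0);
}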
-static uint64_t
+static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
void *buf;
- if (hdr->bth_core) {
+ if (zfs_btree_is_core(hdr)) {
buf = ((zfs_btree_core_t *)hdr)->btc_elems;
} else {
- buf = ((zfs_btree_leaf_t *)hdr)->btl_elems;
+ buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
+ hdr->bth_first * tree->bt_elem_size;
}
zfs_btree_index_t idx;
zfs_btree_core_t *parent = hdr->bth_parent;
- VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
@@ -821,9 +930,8 @@ zfs_btree_bulk_finish(zfs_btree_t *tree)
zfs_btree_leaf_t *leaf = tree->bt_bulk;
zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
zfs_btree_core_t *parent = hdr->bth_parent;
- uint64_t size = tree->bt_elem_size;
- uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
- sizeof (zfs_btree_hdr_t)) / size, 2);
+ size_t size = tree->bt_elem_size;
+ uint32_t capacity = tree->bt_leaf_cap;
/*
* The invariant doesn't apply to the root node, if that's the only
@@ -848,56 +956,54 @@ zfs_btree_bulk_finish(zfs_btree_t *tree)
.bti_offset = 0
};
VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
- ASSERT(idx.bti_node->bth_core);
+ ASSERT(zfs_btree_is_core(idx.bti_node));
zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
- uint64_t common_idx = idx.bti_offset;
+ uint32_t common_idx = idx.bti_offset;
VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
- ASSERT(!idx.bti_node->bth_core);
+ ASSERT(!zfs_btree_is_core(idx.bti_node));
zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
zfs_btree_hdr_t *l_hdr = idx.bti_node;
- uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ uint32_t move_count = (capacity / 2) - hdr->bth_count;
ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
capacity / 2);
if (zfs_btree_verify_intensity >= 5) {
- for (int i = 0; i < move_count; i++) {
+ for (uint32_t i = 0; i < move_count; i++) {
zfs_btree_verify_poison_at(tree, hdr,
leaf->btl_hdr.bth_count + i);
}
}
/* First, shift elements in leaf back. */
- bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count,
- BSD_RIGHT);
+ bt_grow_leaf(tree, leaf, 0, move_count);
/* Next, move the separator from the common ancestor to leaf. */
- uint8_t *separator = common->btc_elems + (common_idx * size);
- uint8_t *out = leaf->btl_elems + ((move_count - 1) * size);
- bmov(separator, out, size);
- move_count--;
+ uint8_t *separator = common->btc_elems + common_idx * size;
+ uint8_t *out = leaf->btl_elems +
+ (hdr->bth_first + move_count - 1) * size;
+ bcpy(separator, out, size);
/*
* Now we move elements from the tail of the left neighbor to
* fill the remaining spots in leaf.
*/
bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
- move_count, move_count, leaf, 0);
+ (move_count - 1), move_count - 1, leaf, 0);
/*
* Finally, move the new last element in the left neighbor to
* the separator.
*/
- bmov(l_neighbor->btl_elems + (l_hdr->bth_count -
- move_count - 1) * size, separator, size);
+ bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
+ l_hdr->bth_count - move_count) * size, separator, size);
/* Adjust the node's counts, and we're done. */
- l_hdr->bth_count -= move_count + 1;
- hdr->bth_count += move_count + 1;
+ bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
+ move_count);
ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
ASSERT3U(hdr->bth_count, >=, capacity / 2);
- zfs_btree_poison_node(tree, l_hdr);
}
/*
@@ -921,16 +1027,16 @@ zfs_btree_bulk_finish(zfs_btree_t *tree)
* splitting is 2, we never need to worry about not having a
* left sibling (a sibling is a neighbor with the same parent).
*/
- uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+ uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
ASSERT3U(parent_idx, >, 0);
zfs_btree_core_t *l_neighbor =
(zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
- uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ uint32_t move_count = (capacity / 2) - hdr->bth_count;
ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
capacity / 2);
if (zfs_btree_verify_intensity >= 5) {
- for (int i = 0; i < move_count; i++) {
+ for (uint32_t i = 0; i < move_count; i++) {
zfs_btree_verify_poison_at(tree, hdr,
hdr->bth_count + i);
}
@@ -943,14 +1049,14 @@ zfs_btree_bulk_finish(zfs_btree_t *tree)
uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
size);
uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
- bmov(separator, e_out, size);
+ bcpy(separator, e_out, size);
/*
* Now, move elements and children from the left node to the
* right. We move one more child than elements.
*/
move_count--;
- uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
+ uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
BSS_TRAPEZOID);
@@ -959,7 +1065,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree)
* separator's position.
*/
move_idx--;
- bmov(l_neighbor->btc_elems + move_idx * size, separator, size);
+ bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);
l_neighbor->btc_hdr.bth_count -= move_count + 1;
hdr->bth_count += move_count + 1;
@@ -969,11 +1075,12 @@ zfs_btree_bulk_finish(zfs_btree_t *tree)
zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
- for (int i = 0; i <= hdr->bth_count; i++)
+ for (uint32_t i = 0; i <= hdr->bth_count; i++)
cur->btc_children[i]->bth_parent = cur;
}
tree->bt_bulk = NULL;
+ zfs_btree_verify(tree);
}
/*
@@ -1006,20 +1113,19 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
ASSERT0(where->bti_offset);
tree->bt_num_nodes++;
- zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
- KM_SLEEP);
+ zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
tree->bt_root = &leaf->btl_hdr;
tree->bt_height++;
zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
hdr->bth_parent = NULL;
- hdr->bth_core = B_FALSE;
+ hdr->bth_first = 0;
hdr->bth_count = 0;
zfs_btree_poison_node(tree, hdr);
zfs_btree_insert_into_leaf(tree, leaf, value, 0);
tree->bt_bulk = leaf;
- } else if (!where->bti_node->bth_core) {
+ } else if (!zfs_btree_is_core(where->bti_node)) {
/*
* If we're inserting into a leaf, go directly to the helper
* function.
@@ -1035,28 +1141,28 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
* value in the node at that spot and then insert the old
* separator into the first slot in the subtree to the right.
*/
- ASSERT(where->bti_node->bth_core);
zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;
/*
* We can ignore bti_before, because either way the value
* should end up in bti_offset.
*/
- uint64_t off = where->bti_offset;
+ uint32_t off = where->bti_offset;
zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
size_t size = tree->bt_elem_size;
uint8_t *buf = kmem_alloc(size, KM_SLEEP);
- bmov(node->btc_elems + off * size, buf, size);
- bmov(value, node->btc_elems + off * size, size);
+ bcpy(node->btc_elems + off * size, buf, size);
+ bcpy(value, node->btc_elems + off * size, size);
/*
* Find the first slot in the subtree to the right, insert
* there.
*/
zfs_btree_index_t new_idx;
- VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL);
+ VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
+ NULL);
ASSERT0(new_idx.bti_offset);
- ASSERT(!new_idx.bti_node->bth_core);
+ ASSERT(!zfs_btree_is_core(new_idx.bti_node));
zfs_btree_insert_into_leaf(tree,
(zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
kmem_free(buf, size);
@@ -1075,7 +1181,7 @@ zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
ASSERT0(tree->bt_num_elems);
return (NULL);
}
- return (zfs_btree_first_helper(tree->bt_root, where));
+ return (zfs_btree_first_helper(tree, tree->bt_root, where));
}
/*
@@ -1088,7 +1194,7 @@ zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
{
zfs_btree_hdr_t *node;
- for (node = hdr; node->bth_core; node =
+ for (node = hdr; zfs_btree_is_core(node); node =
((zfs_btree_core_t *)node)->btc_children[node->bth_count])
;
@@ -1098,7 +1204,8 @@ zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
where->bti_offset = node->bth_count - 1;
where->bti_before = B_FALSE;
}
- return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+ return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
+ btree->bt_elem_size);
}
/*
@@ -1131,8 +1238,8 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
return (NULL);
}
- uint64_t offset = idx->bti_offset;
- if (!idx->bti_node->bth_core) {
+ uint32_t offset = idx->bti_offset;
+ if (!zfs_btree_is_core(idx->bti_node)) {
/*
* When finding the next element of an element in a leaf,
* there are two cases. If the element isn't the last one in
@@ -1143,20 +1250,21 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
* separator after our ancestor in its parent.
*/
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
- uint64_t new_off = offset + (idx->bti_before ? 0 : 1);
+ uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
if (leaf->btl_hdr.bth_count > new_off) {
out_idx->bti_node = &leaf->btl_hdr;
out_idx->bti_offset = new_off;
out_idx->bti_before = B_FALSE;
- return (leaf->btl_elems + new_off * tree->bt_elem_size);
+ return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
+ new_off) * tree->bt_elem_size);
}
zfs_btree_hdr_t *prev = &leaf->btl_hdr;
for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
node != NULL; node = node->btc_hdr.bth_parent) {
zfs_btree_hdr_t *hdr = &node->btc_hdr;
- ASSERT(hdr->bth_core);
- uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ ASSERT(zfs_btree_is_core(hdr));
+ uint32_t i = zfs_btree_find_parent_idx(tree, prev);
if (done_func != NULL)
done_func(tree, prev);
if (i == hdr->bth_count) {
@@ -1178,7 +1286,7 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
}
/* If we were before an element in a core node, return that element. */
- ASSERT(idx->bti_node->bth_core);
+ ASSERT(zfs_btree_is_core(idx->bti_node));
zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
if (idx->bti_before) {
out_idx->bti_before = B_FALSE;
@@ -1190,7 +1298,7 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
* the subtree just to the right of the separator.
*/
zfs_btree_hdr_t *child = node->btc_children[offset + 1];
- return (zfs_btree_first_helper(child, out_idx));
+ return (zfs_btree_first_helper(tree, child, out_idx));
}
/*
@@ -1217,8 +1325,8 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
return (NULL);
}
- uint64_t offset = idx->bti_offset;
- if (!idx->bti_node->bth_core) {
+ uint32_t offset = idx->bti_offset;
+ if (!zfs_btree_is_core(idx->bti_node)) {
/*
* When finding the previous element of an element in a leaf,
* there are two cases. If the element isn't the first one in
@@ -1233,15 +1341,15 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
out_idx->bti_node = &leaf->btl_hdr;
out_idx->bti_offset = offset - 1;
out_idx->bti_before = B_FALSE;
- return (leaf->btl_elems + (offset - 1) *
- tree->bt_elem_size);
+ return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
+ offset - 1) * tree->bt_elem_size);
}
zfs_btree_hdr_t *prev = &leaf->btl_hdr;
for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
node != NULL; node = node->btc_hdr.bth_parent) {
zfs_btree_hdr_t *hdr = &node->btc_hdr;
- ASSERT(hdr->bth_core);
- uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ ASSERT(zfs_btree_is_core(hdr));
+ uint32_t i = zfs_btree_find_parent_idx(tree, prev);
if (i == 0) {
prev = hdr;
continue;
@@ -1262,7 +1370,7 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
* The previous element from one in a core node is the last element in
* the subtree just to the left of the separator.
*/
- ASSERT(idx->bti_node->bth_core);
+ ASSERT(zfs_btree_is_core(idx->bti_node));
zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
zfs_btree_hdr_t *child = node->btc_children[offset];
return (zfs_btree_last_helper(tree, child, out_idx));
@@ -1279,13 +1387,14 @@ void *
zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
{
ASSERT(!idx->bti_before);
- if (!idx->bti_node->bth_core) {
+ size_t size = tree->bt_elem_size;
+ if (!zfs_btree_is_core(idx->bti_node)) {
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
- return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size);
+ return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
+ idx->bti_offset) * size);
}
- ASSERT(idx->bti_node->bth_core);
zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
- return (node->btc_elems + idx->bti_offset * tree->bt_elem_size);
+ return (node->btc_elems + idx->bti_offset * size);
}
/* Add the given value to the tree. Must not already be in the tree. */
@@ -1302,8 +1411,8 @@ static void
zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
{
tree->bt_num_nodes--;
- if (!node->bth_core) {
- kmem_cache_free(zfs_btree_leaf_cache, node);
+ if (!zfs_btree_is_core(node)) {
+ zfs_btree_leaf_free(tree, node);
} else {
kmem_free(node, sizeof (zfs_btree_core_t) +
BTREE_CORE_ELEMS * tree->bt_elem_size);
@@ -1320,7 +1429,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
zfs_btree_hdr_t *rm_hdr)
{
size_t size = tree->bt_elem_size;
- uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
+ uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
zfs_btree_hdr_t *hdr = &node->btc_hdr;
/*
* If the node is the root node and rm_hdr is one of two children,
@@ -1337,7 +1446,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
return;
}
- uint64_t idx;
+ uint32_t idx;
for (idx = 0; idx <= hdr->bth_count; idx++) {
if (node->btc_children[idx] == rm_hdr)
break;
@@ -1357,7 +1466,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
BSS_PARALLELOGRAM);
hdr->bth_count--;
- zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
return;
}
@@ -1378,13 +1487,13 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
* implementing in the future for completeness' sake.
*/
zfs_btree_core_t *parent = hdr->bth_parent;
- uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+ uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
parent->btc_children[parent_idx - 1]);
if (l_hdr != NULL && l_hdr->bth_count > min_count) {
/* We can take a node from the left neighbor. */
- ASSERT(l_hdr->bth_core);
+ ASSERT(zfs_btree_is_core(l_hdr));
zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
/*
@@ -1399,20 +1508,19 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
*/
uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
size;
- bmov(separator, node->btc_elems, size);
+ bcpy(separator, node->btc_elems, size);
/* Move the last child of neighbor to our first child slot. */
- zfs_btree_hdr_t **take_child = neighbor->btc_children +
- l_hdr->bth_count;
- bmov(take_child, node->btc_children, sizeof (*take_child));
+ node->btc_children[0] =
+ neighbor->btc_children[l_hdr->bth_count];
node->btc_children[0]->bth_parent = node;
/* Move the last element of neighbor to the separator spot. */
uint8_t *take_elem = neighbor->btc_elems +
(l_hdr->bth_count - 1) * size;
- bmov(take_elem, separator, size);
+ bcpy(take_elem, separator, size);
l_hdr->bth_count--;
- zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+ zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
return;
}
@@ -1420,7 +1528,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
NULL : parent->btc_children[parent_idx + 1]);
if (r_hdr != NULL && r_hdr->bth_count > min_count) {
/* We can take a node from the right neighbor. */
- ASSERT(r_hdr->bth_core);
+ ASSERT(zfs_btree_is_core(r_hdr));
zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
/*
@@ -1435,21 +1543,19 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
* element spot in node.
*/
uint8_t *separator = parent->btc_elems + parent_idx * size;
- bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size,
+ bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
size);
/*
* Move the first child of neighbor to the last child spot in
* node.
*/
- zfs_btree_hdr_t **take_child = neighbor->btc_children;
- bmov(take_child, node->btc_children + hdr->bth_count,
- sizeof (*take_child));
+ node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
node->btc_children[hdr->bth_count]->bth_parent = node;
/* Move the first element of neighbor to the separator spot. */
uint8_t *take_elem = neighbor->btc_elems;
- bmov(take_elem, separator, size);
+ bcpy(take_elem, separator, size);
r_hdr->bth_count--;
/*
@@ -1458,7 +1564,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
*/
bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
BSS_TRAPEZOID);
- zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
return;
}
@@ -1473,7 +1579,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
* merging.
*/
zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
- uint64_t new_idx = idx;
+ uint32_t new_idx = idx;
if (l_hdr != NULL) {
keep_hdr = l_hdr;
new_rm_hdr = hdr;
@@ -1485,14 +1591,14 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
parent_idx++;
}
- ASSERT(keep_hdr->bth_core);
- ASSERT(new_rm_hdr->bth_core);
+ ASSERT(zfs_btree_is_core(keep_hdr));
+ ASSERT(zfs_btree_is_core(new_rm_hdr));
zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
if (zfs_btree_verify_intensity >= 5) {
- for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) {
+ for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
zfs_btree_verify_poison_at(tree, keep_hdr,
keep_hdr->bth_count + i);
}
@@ -1502,14 +1608,14 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
size;
- bmov(separator, e_out, size);
+ bcpy(separator, e_out, size);
keep_hdr->bth_count++;
/* Move all our elements and children into the left node. */
bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
keep_hdr->bth_count, BSS_TRAPEZOID);
- uint64_t old_count = keep_hdr->bth_count;
+ uint32_t old_count = keep_hdr->bth_count;
/* Update bookkeeping */
keep_hdr->bth_count += new_rm_hdr->bth_count;
@@ -1527,17 +1633,17 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
/* Reparent all our children to point to the left node. */
zfs_btree_hdr_t **new_start = keep->btc_children +
old_count - 1;
- for (int i = 0; i < new_rm_hdr->bth_count + 1; i++)
+ for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
new_start[i]->bth_parent = keep;
- for (int i = 0; i <= keep_hdr->bth_count; i++) {
+ for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
ASSERT3P(keep->btc_children[i], !=, rm_hdr);
}
- zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+ zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);
new_rm_hdr->bth_count = 0;
- zfs_btree_node_destroy(tree, new_rm_hdr);
zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
+ zfs_btree_node_destroy(tree, new_rm_hdr);
}
/* Remove the element at the specific location. */
@@ -1546,9 +1652,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
{
size_t size = tree->bt_elem_size;
zfs_btree_hdr_t *hdr = where->bti_node;
- uint64_t idx = where->bti_offset;
- uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
- sizeof (zfs_btree_hdr_t)) / size, 2);
+ uint32_t idx = where->bti_offset;
ASSERT(!where->bti_before);
if (tree->bt_bulk != NULL) {
@@ -1560,7 +1664,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
*/
uint8_t *value = zfs_btree_get(tree, where);
uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
- bmov(value, tmp, size);
+ bcpy(value, tmp, size);
zfs_btree_bulk_finish(tree);
VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
kmem_free(tmp, size);
@@ -1575,14 +1679,14 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
* makes the rebalance logic not need to be recursive both upwards and
* downwards.
*/
- if (hdr->bth_core) {
+ if (zfs_btree_is_core(hdr)) {
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
void *new_value = zfs_btree_last_helper(tree, left_subtree,
where);
ASSERT3P(new_value, !=, NULL);
- bmov(new_value, node->btc_elems + idx * size, size);
+ bcpy(new_value, node->btc_elems + idx * size, size);
hdr = where->bti_node;
idx = where->bti_offset;
@@ -1594,19 +1698,18 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
* elements after the idx to the left. After that, we rebalance if
* needed.
*/
- ASSERT(!hdr->bth_core);
+ ASSERT(!zfs_btree_is_core(hdr));
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
ASSERT3U(hdr->bth_count, >, 0);
- uint64_t min_count = (capacity / 2) - 1;
+ uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;
/*
* If we're over the minimum size or this is the root, just overwrite
* the value and return.
*/
if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
- hdr->bth_count--;
- bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+ bt_shrink_leaf(tree, leaf, idx, 1);
if (hdr->bth_parent == NULL) {
ASSERT0(tree->bt_height);
if (hdr->bth_count == 0) {
@@ -1615,8 +1718,6 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
zfs_btree_node_destroy(tree, &leaf->btl_hdr);
}
}
- if (tree->bt_root != NULL)
- zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
zfs_btree_verify(tree);
return;
}
@@ -1636,33 +1737,33 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
* worth implementing in the future for completeness' sake.
*/
zfs_btree_core_t *parent = hdr->bth_parent;
- uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+ uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
parent->btc_children[parent_idx - 1]);
if (l_hdr != NULL && l_hdr->bth_count > min_count) {
/* We can take a node from the left neighbor. */
- ASSERT(!l_hdr->bth_core);
+ ASSERT(!zfs_btree_is_core(l_hdr));
+ zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;
/*
* Move our elements back by one spot to make room for the
* stolen element and overwrite the element being removed.
*/
- bt_shift_leaf_right(tree, leaf, 0, idx);
+ bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);
+
+ /* Move the separator to our first spot. */
uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
size;
- uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems +
- (l_hdr->bth_count - 1) * size;
- /* Move the separator to our first spot. */
- bmov(separator, leaf->btl_elems, size);
+ bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);
/* Move our neighbor's last element to the separator. */
- bmov(take_elem, separator, size);
-
- /* Update the bookkeeping. */
- l_hdr->bth_count--;
- zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+ uint8_t *take_elem = neighbor->btl_elems +
+ (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
+ bcpy(take_elem, separator, size);
+ /* Delete our neighbor's last element. */
+ bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
zfs_btree_verify(tree);
return;
}
@@ -1671,7 +1772,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
NULL : parent->btc_children[parent_idx + 1]);
if (r_hdr != NULL && r_hdr->bth_count > min_count) {
/* We can take a node from the right neighbor. */
- ASSERT(!r_hdr->bth_core);
+ ASSERT(!zfs_btree_is_core(r_hdr));
zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
/*
@@ -1679,96 +1780,81 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
* by one spot to make room for the stolen element and
* overwrite the element being removed.
*/
- bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx -
- 1);
+ bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
+ 1, BSD_LEFT);
- uint8_t *separator = parent->btc_elems + parent_idx * size;
- uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems;
/* Move the separator between us to our last spot. */
- bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size,
- size);
+ uint8_t *separator = parent->btc_elems + parent_idx * size;
+ bcpy(separator, leaf->btl_elems + (hdr->bth_first +
+ hdr->bth_count - 1) * size, size);
/* Move our neighbor's first element to the separator. */
- bmov(take_elem, separator, size);
+ uint8_t *take_elem = neighbor->btl_elems +
+ r_hdr->bth_first * size;
+ bcpy(take_elem, separator, size);
- /* Update the bookkeeping. */
- r_hdr->bth_count--;
-
- /*
- * Move our neighbors elements forwards to overwrite the
- * stolen element.
- */
- bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
- zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ /* Delete our neighbor's first element. */
+ bt_shrink_leaf(tree, neighbor, 0, 1);
zfs_btree_verify(tree);
return;
}
/*
* In this case, neither of our neighbors can spare an element, so we
- * need to merge with one of them. We prefer the left one,
- * arbitrarily. Move the separator into the leftmost merging node
+ * need to merge with one of them. We prefer the left one, arbitrarily.
+ * After the removal, we move the separator into the leftmost merging node
* (which may be us or the left neighbor), and then move the right
* merging node's elements. Once that's done, we go back and delete
* the element we're removing. Finally, go into the parent and delete
* the right merging node and the separator. This may cause further
* merging.
*/
- zfs_btree_hdr_t *rm_hdr, *keep_hdr;
- uint64_t new_idx = idx;
+ zfs_btree_hdr_t *rm_hdr, *k_hdr;
if (l_hdr != NULL) {
- keep_hdr = l_hdr;
+ k_hdr = l_hdr;
rm_hdr = hdr;
- new_idx += keep_hdr->bth_count + 1; // 449
} else {
ASSERT3P(r_hdr, !=, NULL);
- keep_hdr = hdr;
+ k_hdr = hdr;
rm_hdr = r_hdr;
parent_idx++;
}
-
- ASSERT(!keep_hdr->bth_core);
- ASSERT(!rm_hdr->bth_core);
- ASSERT3U(keep_hdr->bth_count, ==, min_count);
+ ASSERT(!zfs_btree_is_core(k_hdr));
+ ASSERT(!zfs_btree_is_core(rm_hdr));
+ ASSERT3U(k_hdr->bth_count, ==, min_count);
ASSERT3U(rm_hdr->bth_count, ==, min_count);
-
- zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+ zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
if (zfs_btree_verify_intensity >= 5) {
- for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
- zfs_btree_verify_poison_at(tree, keep_hdr,
- keep_hdr->bth_count + i);
+ for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
+ zfs_btree_verify_poison_at(tree, k_hdr,
+ k_hdr->bth_count + i);
}
}
+
/*
- * Move the separator into the first open spot in the left
- * neighbor.
+ * Remove the value from the node. It will go below the minimum,
+ * but we'll fix it in no time.
*/
- uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
- uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
- size;
- bmov(separator, out, size);
- keep_hdr->bth_count++;
+ bt_shrink_leaf(tree, leaf, idx, 1);
- /* Move our elements to the left neighbor. */
- bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
- keep_hdr->bth_count);
+ /* Prepare space for elements to be moved from the right. */
+ uint32_t k_count = k_hdr->bth_count;
+ bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
+ ASSERT3U(k_hdr->bth_count, ==, min_count * 2);
- /* Update the bookkeeping. */
- keep_hdr->bth_count += rm_hdr->bth_count;
- ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+ /* Move the separator into the first open spot. */
+ uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
+ bcpy(separator, out, size);
- /* Remove the value from the node */
- keep_hdr->bth_count--;
- bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
- new_idx);
- zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+ /* Move our elements to the left neighbor. */
+ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);
- rm_hdr->bth_count = 0;
- zfs_btree_node_destroy(tree, rm_hdr);
/* Remove the emptied node from the parent. */
zfs_btree_remove_from_node(tree, parent, rm_hdr);
+ zfs_btree_node_destroy(tree, rm_hdr);
zfs_btree_verify(tree);
}
@@ -1831,11 +1917,10 @@ zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
static void
zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
- if (hdr->bth_core) {
+ if (zfs_btree_is_core(hdr)) {
zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
- for (int i = 0; i <= hdr->bth_count; i++) {
+ for (uint32_t i = 0; i <= hdr->bth_count; i++)
zfs_btree_clear_helper(tree, btc->btc_children[i]);
- }
}
zfs_btree_node_destroy(tree, hdr);
@@ -1868,11 +1953,11 @@ zfs_btree_destroy(zfs_btree_t *tree)
static void
zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
- if (!hdr->bth_core)
+ if (!zfs_btree_is_core(hdr))
return;
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
- for (int i = 0; i <= hdr->bth_count; i++) {
+ for (uint32_t i = 0; i <= hdr->bth_count; i++) {
VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
}
@@ -1897,11 +1982,10 @@ zfs_btree_verify_pointers(zfs_btree_t *tree)
static uint64_t
zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
- if (!hdr->bth_core) {
- if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) {
- uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
- sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2);
- VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1);
+ if (!zfs_btree_is_core(hdr)) {
+ if (tree->bt_root != hdr && tree->bt_bulk &&
+ hdr != &tree->bt_bulk->btl_hdr) {
+ VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
}
return (hdr->bth_count);
@@ -1911,7 +1995,7 @@ zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
uint64_t ret = hdr->bth_count;
if (tree->bt_root != hdr && tree->bt_bulk == NULL)
VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
- for (int i = 0; i <= hdr->bth_count; i++) {
+ for (uint32_t i = 0; i <= hdr->bth_count; i++) {
ret += zfs_btree_verify_counts_helper(tree,
node->btc_children[i]);
}
@@ -1941,17 +2025,16 @@ zfs_btree_verify_counts(zfs_btree_t *tree)
*/
static uint64_t
zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
- int64_t height)
+ int32_t height)
{
- if (!hdr->bth_core) {
+ if (!zfs_btree_is_core(hdr)) {
VERIFY0(height);
return (1);
}
- VERIFY(hdr->bth_core);
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
uint64_t ret = 1;
- for (int i = 0; i <= hdr->bth_count; i++) {
+ for (uint32_t i = 0; i <= hdr->bth_count; i++) {
ret += zfs_btree_verify_height_helper(tree,
node->btc_children[i], height - 1);
}
@@ -1983,24 +2066,26 @@ static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
size_t size = tree->bt_elem_size;
- if (!hdr->bth_core) {
+ if (!zfs_btree_is_core(hdr)) {
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
- for (int i = 1; i < hdr->bth_count; i++) {
- VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) *
- size, leaf->btl_elems + i * size), ==, -1);
+ for (uint32_t i = 1; i < hdr->bth_count; i++) {
+ VERIFY3S(tree->bt_compar(leaf->btl_elems +
+ (hdr->bth_first + i - 1) * size,
+ leaf->btl_elems +
+ (hdr->bth_first + i) * size), ==, -1);
}
return;
}
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
- for (int i = 1; i < hdr->bth_count; i++) {
+ for (uint32_t i = 1; i < hdr->bth_count; i++) {
VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
node->btc_elems + i * size), ==, -1);
}
- for (int i = 0; i < hdr->bth_count; i++) {
+ for (uint32_t i = 0; i < hdr->bth_count; i++) {
uint8_t *left_child_last = NULL;
zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
- if (left_child_hdr->bth_core) {
+ if (zfs_btree_is_core(left_child_hdr)) {
zfs_btree_core_t *left_child =
(zfs_btree_core_t *)left_child_hdr;
left_child_last = left_child->btc_elems +
@@ -2009,40 +2094,39 @@ zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
zfs_btree_leaf_t *left_child =
(zfs_btree_leaf_t *)left_child_hdr;
left_child_last = left_child->btl_elems +
- (left_child_hdr->bth_count - 1) * size;
+ (left_child_hdr->bth_first +
+ left_child_hdr->bth_count - 1) * size;
}
- if (tree->bt_compar(node->btc_elems + i * size,
- left_child_last) != 1) {
+ int comp = tree->bt_compar(node->btc_elems + i * size,
+ left_child_last);
+ if (comp <= 0) {
panic("btree: compar returned %d (expected 1) at "
- "%px %d: compar(%px, %px)", tree->bt_compar(
- node->btc_elems + i * size, left_child_last),
- (void *)node, i, (void *)(node->btc_elems + i *
- size), (void *)left_child_last);
+ "%px %d: compar(%px, %px)", comp, node, i,
+ node->btc_elems + i * size, left_child_last);
}
uint8_t *right_child_first = NULL;
zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
- if (right_child_hdr->bth_core) {
+ if (zfs_btree_is_core(right_child_hdr)) {
zfs_btree_core_t *right_child =
(zfs_btree_core_t *)right_child_hdr;
right_child_first = right_child->btc_elems;
} else {
zfs_btree_leaf_t *right_child =
(zfs_btree_leaf_t *)right_child_hdr;
- right_child_first = right_child->btl_elems;
+ right_child_first = right_child->btl_elems +
+ right_child_hdr->bth_first * size;
}
- if (tree->bt_compar(node->btc_elems + i * size,
- right_child_first) != -1) {
+ comp = tree->bt_compar(node->btc_elems + i * size,
+ right_child_first);
+ if (comp >= 0) {
panic("btree: compar returned %d (expected -1) at "
- "%px %d: compar(%px, %px)", tree->bt_compar(
- node->btc_elems + i * size, right_child_first),
- (void *)node, i, (void *)(node->btc_elems + i *
- size), (void *)right_child_first);
+ "%px %d: compar(%px, %px)", comp, node, i,
+ node->btc_elems + i * size, right_child_first);
}
}
- for (int i = 0; i <= hdr->bth_count; i++) {
+ for (uint32_t i = 0; i <= hdr->bth_count; i++)
zfs_btree_verify_order_helper(tree, node->btc_children[i]);
- }
}
/* Check that all elements in the tree are in sorted order. */
@@ -2063,27 +2147,28 @@ static void
zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
size_t size = tree->bt_elem_size;
- if (!hdr->bth_core) {
+ if (!zfs_btree_is_core(hdr)) {
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
- uint8_t val = 0x0f;
- for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE -
- sizeof (zfs_btree_hdr_t); i++) {
- VERIFY3U(leaf->btl_elems[i], ==, val);
- }
+ for (size_t i = 0; i < hdr->bth_first * size; i++)
+ VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
+ size_t esize = tree->bt_leaf_size -
+ offsetof(zfs_btree_leaf_t, btl_elems);
+ for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
+ i < esize; i++)
+ VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
} else {
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
- uint8_t val = 0x0f;
- for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size;
- i++) {
- VERIFY3U(node->btc_elems[i], ==, val);
- }
+ for (size_t i = hdr->bth_count * size;
+ i < BTREE_CORE_ELEMS * size; i++)
+ VERIFY3U(node->btc_elems[i], ==, 0x0f);
- for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
+ i++) {
VERIFY3P(node->btc_children[i], ==,
(zfs_btree_hdr_t *)BTREE_POISON);
}
- for (int i = 0; i <= hdr->bth_count; i++) {
+ for (uint32_t i = 0; i <= hdr->bth_count; i++) {
zfs_btree_verify_poison_helper(tree,
node->btc_children[i]);
}
@@ -2122,3 +2207,9 @@ zfs_btree_verify(zfs_btree_t *tree)
return;
zfs_btree_verify_poison(tree);
}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
+ "Enable btree verification. Levels above 4 require ZFS be built "
+ "with debugging");
+/* END CSTYLED */
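
The btree.c hunks above replace the on-the-fly capacity recomputation with the cached tree->bt_leaf_cap and switch leaves to an offset-based layout (bth_first plus bth_count, with bt_grow_leaf()/bt_shrink_leaf() doing the bookkeeping). The standalone sketch below only illustrates that addressing scheme and the min_count rebalance threshold; it is not part of the patch, and its type and value choices are invented stand-ins for the real zfs_btree structures.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in; the real zfs_btree_hdr_t/zfs_btree_leaf_t differ. */
typedef struct {
	uint32_t bth_first;	/* index of the first live element */
	uint32_t bth_count;	/* number of live elements */
} leaf_hdr_t;

/* Live element i now sits at (bth_first + i) * size, as in the diff. */
static const uint8_t *
leaf_elem(const leaf_hdr_t *hdr, const uint8_t *elems, size_t size, uint32_t i)
{
	return (elems + (size_t)(hdr->bth_first + i) * size);
}

int
main(void)
{
	uint32_t bt_leaf_cap = 512;		/* hypothetical leaf capacity */
	uint32_t min_count = (bt_leaf_cap / 2) - 1;	/* rebalance threshold */
	leaf_hdr_t hdr = { .bth_first = 3, .bth_count = 10 };
	uint8_t elems[8192] = { 0 };

	printf("min_count=%u, element 0 starts at byte offset %zu\n",
	    min_count, (size_t)(leaf_elem(&hdr, elems, 8, 0) - elems));
	return (0);
}
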
diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
index 3fbb24ddef5e..2ac058fd2c93 100644
--- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c
+++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -37,18 +37,37 @@ static dataset_kstat_values_t empty_dataset_kstats = {
{ "nread", KSTAT_DATA_UINT64 },
{ "nunlinks", KSTAT_DATA_UINT64 },
{ "nunlinked", KSTAT_DATA_UINT64 },
+ {
+ { "zil_commit_count", KSTAT_DATA_UINT64 },
+ { "zil_commit_writer_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_copied_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }
+ }
};
static int
dataset_kstats_update(kstat_t *ksp, int rw)
{
dataset_kstats_t *dk = ksp->ks_private;
- ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data);
+ dataset_kstat_values_t *dkv = ksp->ks_data;
+ ASSERT3P(dk->dk_kstats->ks_data, ==, dkv);
if (rw == KSTAT_WRITE)
return (EACCES);
- dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
dkv->dkv_writes.value.ui64 =
wmsum_value(&dk->dk_sums.dss_writes);
dkv->dkv_nwritten.value.ui64 =
@@ -62,10 +81,12 @@ dataset_kstats_update(kstat_t *ksp, int rw)
dkv->dkv_nunlinked.value.ui64 =
wmsum_value(&dk->dk_sums.dss_nunlinked);
+ zil_kstat_values_update(&dkv->dkv_zil_stats, &dk->dk_zil_sums);
+
return (0);
}
-void
+int
dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
{
/*
@@ -75,7 +96,7 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
* a filesystem with many snapshots, we skip them for now.
*/
if (dmu_objset_is_snapshot(objset))
- return;
+ return (0);
/*
* At the time of this writing, KSTAT_STRLEN is 255 in Linux,
@@ -94,13 +115,13 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
" snprintf() for kstat module name returned %d",
(unsigned long long)dmu_objset_id(objset), n);
- return;
+ return (SET_ERROR(EINVAL));
} else if (n >= KSTAT_STRLEN) {
zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
"kstat module name length (%d) exceeds limit (%d)",
(unsigned long long)dmu_objset_id(objset),
n, KSTAT_STRLEN);
- return;
+ return (SET_ERROR(ENAMETOOLONG));
}
char kstat_name[KSTAT_STRLEN];
@@ -110,20 +131,25 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
" snprintf() for kstat name returned %d",
(unsigned long long)dmu_objset_id(objset), n);
- return;
+ return (SET_ERROR(EINVAL));
+ } else if (n >= KSTAT_STRLEN) {
+ zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+ "kstat name length (%d) exceeds limit (%d)",
+ (unsigned long long)dmu_objset_id(objset),
+ n, KSTAT_STRLEN);
+ return (SET_ERROR(ENAMETOOLONG));
}
- ASSERT3U(n, <, KSTAT_STRLEN);
kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
"dataset", KSTAT_TYPE_NAMED,
sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (kstat == NULL)
- return;
+ return (SET_ERROR(ENOMEM));
dataset_kstat_values_t *dk_kstats =
kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
- bcopy(&empty_dataset_kstats, dk_kstats,
+ memcpy(dk_kstats, &empty_dataset_kstats,
sizeof (empty_dataset_kstats));
char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
@@ -137,15 +163,17 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
kstat->ks_private = dk;
kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN;
- kstat_install(kstat);
- dk->dk_kstats = kstat;
-
wmsum_init(&dk->dk_sums.dss_writes, 0);
wmsum_init(&dk->dk_sums.dss_nwritten, 0);
wmsum_init(&dk->dk_sums.dss_reads, 0);
wmsum_init(&dk->dk_sums.dss_nread, 0);
wmsum_init(&dk->dk_sums.dss_nunlinks, 0);
wmsum_init(&dk->dk_sums.dss_nunlinked, 0);
+ zil_sums_init(&dk->dk_zil_sums);
+
+ dk->dk_kstats = kstat;
+ kstat_install(kstat);
+ return (0);
}
void
@@ -155,19 +183,31 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
return;
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+ kstat_delete(dk->dk_kstats);
+ dk->dk_kstats = NULL;
kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name),
KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
kmem_free(dkv, sizeof (empty_dataset_kstats));
- kstat_delete(dk->dk_kstats);
- dk->dk_kstats = NULL;
-
wmsum_fini(&dk->dk_sums.dss_writes);
wmsum_fini(&dk->dk_sums.dss_nwritten);
wmsum_fini(&dk->dk_sums.dss_reads);
wmsum_fini(&dk->dk_sums.dss_nread);
wmsum_fini(&dk->dk_sums.dss_nunlinks);
wmsum_fini(&dk->dk_sums.dss_nunlinked);
+ zil_sums_fini(&dk->dk_zil_sums);
+}
+
+void
+dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
+{
+ dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+ char *ds_name;
+
+ ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name);
+ ASSERT3S(ds_name, !=, NULL);
+ (void) strlcpy(ds_name, name,
+ KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
}
void
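
dataset_kstats_create() now reports failures instead of returning silently, and kstat_install() is deferred until the structure is fully initialized. The snippet below is only an illustration of the two new name-validation error paths (EINVAL when snprintf() fails, ENAMETOOLONG when the name would exceed KSTAT_STRLEN); the helper name and format string are invented for the sketch, not taken from the patch.

#include <errno.h>
#include <stdio.h>

#define	KSTAT_STRLEN	255	/* Linux limit noted in the surrounding code */

/* Hypothetical helper mirroring the new checks in dataset_kstats_create(). */
static int
format_kstat_name(char *buf, size_t buflen, unsigned long long objset_id)
{
	int n = snprintf(buf, buflen, "objset-0x%llx", objset_id);

	if (n < 0)
		return (EINVAL);	/* formatting failed outright */
	if (n >= KSTAT_STRLEN)
		return (ENAMETOOLONG);	/* name would have been truncated */
	return (0);
}

int
main(void)
{
	char name[KSTAT_STRLEN];
	int rc = format_kstat_name(name, sizeof (name), 0x36);

	printf("rc=%d name=%s\n", rc, rc == 0 ? name : "(invalid)");
	return (0);
}
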
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index fe54da425286..56fe2c4dbe30 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -26,6 +26,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/zfs_context.h>
@@ -49,13 +50,14 @@
#include <sys/trace_zfs.h>
#include <sys/callb.h>
#include <sys/abd.h>
+#include <sys/brt.h>
#include <sys/vdev.h>
#include <cityhash.h>
#include <sys/spa_impl.h>
#include <sys/wmsum.h>
#include <sys/vdev_impl.h>
-kstat_t *dbuf_ksp;
+static kstat_t *dbuf_ksp;
typedef struct dbuf_stats {
/*
@@ -100,6 +102,11 @@ typedef struct dbuf_stats {
*/
kstat_named_t hash_insert_race;
/*
+ * Number of entries in the hash table dbuf and mutex arrays.
+ */
+ kstat_named_t hash_table_count;
+ kstat_named_t hash_mutex_count;
+ /*
* Statistics about the size of the metadata dbuf cache.
*/
kstat_named_t metadata_cache_count;
@@ -131,6 +138,8 @@ dbuf_stats_t dbuf_stats = {
{ "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 },
{ "hash_insert_race", KSTAT_DATA_UINT64 },
+ { "hash_table_count", KSTAT_DATA_UINT64 },
+ { "hash_mutex_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
@@ -152,13 +161,13 @@ struct {
} dbuf_sums;
#define DBUF_STAT_INCR(stat, val) \
- wmsum_add(&dbuf_sums.stat, val);
+ wmsum_add(&dbuf_sums.stat, val)
#define DBUF_STAT_DECR(stat, val) \
- DBUF_STAT_INCR(stat, -(val));
+ DBUF_STAT_INCR(stat, -(val))
#define DBUF_STAT_BUMP(stat) \
- DBUF_STAT_INCR(stat, 1);
+ DBUF_STAT_INCR(stat, 1)
#define DBUF_STAT_BUMPDOWN(stat) \
- DBUF_STAT_INCR(stat, -1);
+ DBUF_STAT_INCR(stat, -1)
#define DBUF_STAT_MAX(stat, v) { \
uint64_t _m; \
while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
@@ -166,15 +175,8 @@ struct {
continue; \
}
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
-static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
-
-extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
- dmu_buf_evict_func_t *evict_func_sync,
- dmu_buf_evict_func_t *evict_func_async,
- dmu_buf_t **clear_on_evict_dbufp);
/*
* Global data structures and functions for the dbuf cache.
@@ -225,12 +227,15 @@ typedef struct dbuf_cache {
dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
/* Size limits for the caches */
-unsigned long dbuf_cache_max_bytes = ULONG_MAX;
-unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
+static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
+static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
/* Set the default sizes of the caches to log2 fraction of arc size */
-int dbuf_cache_shift = 5;
-int dbuf_metadata_cache_shift = 6;
+static uint_t dbuf_cache_shift = 5;
+static uint_t dbuf_metadata_cache_shift = 6;
+
+/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
+static uint_t dbuf_mutex_cache_shift = 0;
static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void);
@@ -277,18 +282,18 @@ static unsigned long dbuf_metadata_cache_target_bytes(void);
/*
* The percentage above and below the maximum cache size.
*/
-uint_t dbuf_cache_hiwater_pct = 10;
-uint_t dbuf_cache_lowater_pct = 10;
+static uint_t dbuf_cache_hiwater_pct = 10;
+static uint_t dbuf_cache_lowater_pct = 10;
-/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
dmu_buf_impl_t *db = vdb;
- bzero(db, sizeof (dmu_buf_impl_t));
+ memset(db, 0, sizeof (dmu_buf_impl_t));
- mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
+ rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL);
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
multilist_link_init(&db->db_cache_link);
zfs_refcount_create(&db->db_holds);
@@ -296,10 +301,10 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
return (0);
}
-/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
+ (void) unused;
dmu_buf_impl_t *db = vdb;
mutex_destroy(&db->db_mtx);
rw_destroy(&db->db_rwlock);
@@ -334,7 +339,8 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
(dbuf)->db_blkid == (blkid))
dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
+ uint64_t *hash_out)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
uint64_t hv;
@@ -356,6 +362,8 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
}
}
mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ if (hash_out != NULL)
+ *hash_out = hv;
return (NULL);
}
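
dbuf_find() can now hand back the computed hash through hash_out, and dbuf_hash_insert()/dbuf_hash_remove() reuse the value cached in db_hash instead of rehashing, only asserting that it still matches. A generic userland sketch of that compute-once pattern follows; the names and the mixing function are invented, not the ZFS ones.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct entry {
	uint64_t key;
	uint64_t hash;		/* cached when the entry is created */
	struct entry *next;
} entry_t;

static uint64_t
hash_key(uint64_t key)
{
	/* Any reasonable 64-bit mixer works for the sketch. */
	key ^= key >> 33;
	key *= 0xff51afd7ed558ccdULL;
	key ^= key >> 33;
	return (key);
}

static void
table_insert(entry_t **table, uint64_t mask, entry_t *e)
{
	/* Like dbuf_hash_insert(): trust the cached hash, but verify. */
	assert(e->hash == hash_key(e->key));
	uint64_t idx = e->hash & mask;
	e->next = table[idx];
	table[idx] = e;
}

int
main(void)
{
	entry_t *table[8] = { 0 };
	entry_t e = { .key = 42 };

	e.hash = hash_key(e.key);	/* computed once, as in dbuf_find() */
	table_insert(table, 7, &e);
	printf("bucket=%llu\n", (unsigned long long)(e.hash & 7));
	return (0);
}
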
@@ -390,13 +398,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
objset_t *os = db->db_objset;
uint64_t obj = db->db.db_object;
int level = db->db_level;
- uint64_t blkid, hv, idx;
+ uint64_t blkid, idx;
dmu_buf_impl_t *dbf;
uint32_t i;
blkid = db->db_blkid;
- hv = dbuf_hash(os, obj, level, blkid);
- idx = hv & h->hash_table_mask;
+ ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
+ idx = db->db_hash & h->hash_table_mask;
mutex_enter(DBUF_HASH_MUTEX(h, idx));
for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
@@ -470,12 +478,12 @@ static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv, idx;
+ uint64_t idx;
dmu_buf_impl_t *dbf, **dbp;
- hv = dbuf_hash(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- idx = hv & h->hash_table_mask;
+ ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
+ db->db_blkid), ==, db->db_hash);
+ idx = db->db_hash & h->hash_table_mask;
/*
* We mustn't hold db_mtx to maintain lock ordering:
@@ -560,6 +568,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif
+ if (db->db_caching_status != DB_NO_CACHE) {
+ /*
+ * This is a cached dbuf, so the size of the user data is
+ * included in its cached amount. We adjust it here because the
+ * user data has already been detached from the dbuf, and the
+ * sync functions are not supposed to touch it (the dbuf might
+ * not exist anymore by the time the sync functions run).
+ */
+ uint64_t size = dbu->dbu_size;
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size, size, dbu);
+ if (db->db_caching_status == DB_DBUF_CACHE)
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
+ }
+
/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
@@ -607,58 +630,58 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
boolean_t
dbuf_is_l2cacheable(dmu_buf_impl_t *db)
{
- vdev_t *vd = NULL;
- zfs_cache_type_t cache = db->db_objset->os_secondary_cache;
- blkptr_t *bp = db->db_blkptr;
-
- if (bp != NULL && !BP_IS_HOLE(bp)) {
+ if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+ (db->db_objset->os_secondary_cache ==
+ ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
+ if (l2arc_exclude_special == 0)
+ return (B_TRUE);
+
+ blkptr_t *bp = db->db_blkptr;
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (B_FALSE);
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
+ vdev_t *vd = NULL;
if (vdev < rvd->vdev_children)
vd = rvd->vdev_child[vdev];
- if (cache == ZFS_CACHE_ALL ||
- (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) {
- if (vd == NULL)
- return (B_TRUE);
+ if (vd == NULL)
+ return (B_TRUE);
- if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
- vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
- l2arc_exclude_special == 0)
- return (B_TRUE);
- }
+ if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+ vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+ return (B_TRUE);
}
-
return (B_FALSE);
}
static inline boolean_t
dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
{
- vdev_t *vd = NULL;
- zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache;
-
- if (bp != NULL && !BP_IS_HOLE(bp)) {
+ if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+ (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
+ (level > 0 ||
+ DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
+ if (l2arc_exclude_special == 0)
+ return (B_TRUE);
+
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (B_FALSE);
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
+ vdev_t *vd = NULL;
if (vdev < rvd->vdev_children)
vd = rvd->vdev_child[vdev];
- if (cache == ZFS_CACHE_ALL || ((level > 0 ||
- DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) &&
- cache == ZFS_CACHE_METADATA)) {
- if (vd == NULL)
- return (B_TRUE);
+ if (vd == NULL)
+ return (B_TRUE);
- if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
- vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
- l2arc_exclude_special == 0)
- return (B_TRUE);
- }
+ if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+ vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+ return (B_TRUE);
}
-
return (B_FALSE);
}
@@ -745,7 +768,7 @@ static void
dbuf_evict_one(void)
{
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
- multilist_sublist_t *mls = multilist_sublist_lock(
+ multilist_sublist_t *mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
@@ -761,12 +784,15 @@ dbuf_evict_one(void)
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
+ uint64_t size = db->db.db_size;
+ uint64_t usize = dmu_buf_user_size(&db->db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[DB_DBUF_CACHE].size, size, db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+ &dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
@@ -783,10 +809,10 @@ dbuf_evict_one(void)
* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
* out of the cache it is destroyed and becomes eligible for arc eviction.
*/
-/* ARGSUSED */
-static void
+static __attribute__((noreturn)) void
dbuf_evict_thread(void *unused)
{
+ (void) unused;
callb_cpr_t cpr;
CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
@@ -822,7 +848,7 @@ dbuf_evict_thread(void *unused)
/*
* Wake up the dbuf eviction thread if the dbuf cache is at its max size.
* If the dbuf cache is at its high water mark, then evict a dbuf from the
- * dbuf cache using the callers context.
+ * dbuf cache using the caller's context.
*/
static void
dbuf_evict_notify(uint64_t size)
@@ -843,6 +869,7 @@ static int
dbuf_kstat_update(kstat_t *ksp, int rw)
{
dbuf_stats_t *ds = ksp->ks_data;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
if (rw == KSTAT_WRITE)
return (SET_ERROR(EACCES));
@@ -872,6 +899,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
wmsum_value(&dbuf_sums.hash_chains);
ds->hash_insert_race.value.ui64 =
wmsum_value(&dbuf_sums.hash_insert_race);
+ ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
+ ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
ds->metadata_cache_count.value.ui64 =
wmsum_value(&dbuf_sums.metadata_cache_count);
ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
@@ -884,9 +913,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw)
void
dbuf_init(void)
{
- uint64_t hsize = 1ULL << 16;
+ uint64_t hmsize, hsize = 1ULL << 16;
dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
/*
* The hash table is big enough to fill one eighth of physical memory
@@ -897,30 +925,43 @@ dbuf_init(void)
while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
hsize <<= 1;
-retry:
- h->hash_table_mask = hsize - 1;
-#if defined(_KERNEL)
+ h->hash_table = NULL;
+ while (h->hash_table == NULL) {
+ h->hash_table_mask = hsize - 1;
+
+ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+ if (h->hash_table == NULL)
+ hsize >>= 1;
+
+ ASSERT3U(hsize, >=, 1ULL << 10);
+ }
+
/*
- * Large allocations which do not require contiguous pages
- * should be using vmem_alloc() in the linux kernel
+ * The hash table buckets are protected by an array of mutexes where
+ * each mutex is responsible for protecting 128 buckets. A minimum
+ * array size of 8192 is targeted to avoid contention.
*/
- h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
-#else
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
-#endif
- if (h->hash_table == NULL) {
- /* XXX - we should really return an error instead of assert */
- ASSERT(hsize > (1ULL << 10));
- hsize >>= 1;
- goto retry;
+ if (dbuf_mutex_cache_shift == 0)
+ hmsize = MAX(hsize >> 7, 1ULL << 13);
+ else
+ hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
+
+ h->hash_mutexes = NULL;
+ while (h->hash_mutexes == NULL) {
+ h->hash_mutex_mask = hmsize - 1;
+
+ h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
+ KM_SLEEP);
+ if (h->hash_mutexes == NULL)
+ hmsize >>= 1;
}
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
- for (i = 0; i < DBUF_MUTEXES; i++)
- mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+ for (int i = 0; i < hmsize; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL);
dbuf_stats_init(h);
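
The rewritten dbuf_init() above drops the kernel/userland #ifdef, halves the table size on allocation failure, and sizes the new mutex array either dynamically (one mutex per 128 buckets, at least 8192) or from dbuf_mutex_cache_shift capped at 24. The sketch below reproduces just that sizing arithmetic with placeholder inputs standing in for arc_all_memory() and zfs_arc_average_blocksize; it is illustrative only.

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	/* Placeholder inputs (assumed values, not ZFS defaults). */
	uint64_t all_memory = 16ULL << 30;	/* 16 GiB physical memory */
	uint64_t avg_blocksize = 8192;		/* zfs_arc_average_blocksize */
	uint32_t mutex_shift = 0;		/* dbuf_mutex_cache_shift */

	/* Hash table: big enough to fill 1/8 of memory with average blocks. */
	uint64_t hsize = 1ULL << 16;
	while (hsize * avg_blocksize < all_memory / 8)
		hsize <<= 1;

	/* Mutex array: one mutex per 128 buckets, at least 8192 entries. */
	uint64_t hmsize;
	if (mutex_shift == 0)
		hmsize = MAX(hsize >> 7, 1ULL << 13);
	else
		hmsize = 1ULL << MIN(mutex_shift, 24);

	printf("hash buckets=%llu, mutexes=%llu\n",
	    (unsigned long long)hsize, (unsigned long long)hmsize);
	return (0);
}
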
@@ -946,7 +987,7 @@ retry:
wmsum_init(&dbuf_sums.cache_count, 0);
wmsum_init(&dbuf_sums.cache_total_evicts, 0);
- for (i = 0; i < DN_MAX_LEVELS; i++) {
+ for (int i = 0; i < DN_MAX_LEVELS; i++) {
wmsum_init(&dbuf_sums.cache_levels[i], 0);
wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
}
@@ -962,7 +1003,7 @@ retry:
KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (dbuf_ksp != NULL) {
- for (i = 0; i < DN_MAX_LEVELS; i++) {
+ for (int i = 0; i < DN_MAX_LEVELS; i++) {
snprintf(dbuf_stats.cache_levels[i].name,
KSTAT_STRLEN, "cache_level_%d", i);
dbuf_stats.cache_levels[i].data_type =
@@ -982,21 +1023,16 @@ void
dbuf_fini(void)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
dbuf_stats_destroy();
- for (i = 0; i < DBUF_MUTEXES; i++)
+ for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
mutex_destroy(&h->hash_mutexes[i]);
-#if defined(_KERNEL)
- /*
- * Large allocations which do not require contiguous pages
- * should be using vmem_free() in the linux kernel
- */
+
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#else
- kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#endif
+ vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
+ sizeof (kmutex_t));
+
kmem_cache_destroy(dbuf_kmem_cache);
taskq_destroy(dbu_evict_taskq);
@@ -1023,7 +1059,7 @@ dbuf_fini(void)
wmsum_fini(&dbuf_sums.cache_count);
wmsum_fini(&dbuf_sums.cache_total_evicts);
- for (i = 0; i < DN_MAX_LEVELS; i++) {
+ for (int i = 0; i < DN_MAX_LEVELS; i++) {
wmsum_fini(&dbuf_sums.cache_levels[i]);
wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
}
@@ -1137,7 +1173,7 @@ dbuf_verify(dmu_buf_impl_t *db)
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
(db->db_buf == NULL || db->db_buf->b_data) &&
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_state != DB_FILL && !dn->dn_free_txg) {
+ db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
/*
* If the blkptr isn't set but they have nonzero data,
* it had better be dirty, otherwise we'll lose that
@@ -1183,7 +1219,7 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT0(bp->blk_pad[1]);
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(BP_IS_HOLE(bp));
- ASSERT0(bp->blk_phys_birth);
+ ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
}
}
}
@@ -1240,7 +1276,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
mutex_exit(&db->db_mtx);
abuf = arc_loan_buf(spa, B_FALSE, blksz);
- bcopy(db->db.db_data, abuf->b_data, blksz);
+ memcpy(abuf->b_data, db->db.db_data, blksz);
} else {
abuf = db->db_buf;
arc_loan_inuse_buf(abuf, db);
@@ -1302,7 +1338,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
* used when modifying or reading db_blkptr.
*/
db_lock_type_t
-dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
+dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
{
enum db_lock_type ret = DLT_NONE;
if (db->db_parent != NULL) {
@@ -1327,7 +1363,7 @@ dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
* panic if we didn't pass the lock type in.
*/
void
-dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
+dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
{
if (type == DLT_PARENT)
rw_exit(&db->db_parent->db_rwlock);
@@ -1339,6 +1375,7 @@ static void
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *vdb)
{
+ (void) zb, (void) bp;
dmu_buf_impl_t *db = vdb;
mutex_enter(&db->db_mtx);
@@ -1360,7 +1397,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
/* freed in flight */
ASSERT(zio == NULL || zio->io_error == 0);
arc_release(buf, db);
- bzero(buf->b_data, db->db.db_size);
+ memset(buf->b_data, 0, db->db.db_size);
arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
@@ -1383,13 +1420,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
* a decrypted block. Otherwise success.
*/
static int
-dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
{
- int bonuslen, max_bonuslen, err;
-
- err = dbuf_read_verify_dnode_crypt(db, flags);
- if (err)
- return (err);
+ int bonuslen, max_bonuslen;
bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@@ -1399,16 +1432,16 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
if (bonuslen < max_bonuslen)
- bzero(db->db.db_data, max_bonuslen);
+ memset(db->db.db_data, 0, max_bonuslen);
if (bonuslen)
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen);
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "bonus buffer filled");
return (0);
}
static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
{
blkptr_t *bps = db->db.db_data;
uint32_t indbs = 1ULL << dn->dn_indblkshift;
@@ -1417,12 +1450,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
for (int i = 0; i < n_bps; i++) {
blkptr_t *bp = &bps[i];
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
- BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
- dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
- BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
- BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
- BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+ BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+ dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+ BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+ BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+ BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
}
}
@@ -1432,30 +1465,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
* was taken, ENOENT if no action was taken.
*/
static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+ int is_hole = bp == NULL || BP_IS_HOLE(bp);
/*
* For level 0 blocks only, if the above check fails:
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
* processes the delete record and clears the bp while we are waiting
* for the dn_mtx (resulting in a "no" from block_freed).
*/
- if (!is_hole && db->db_level == 0) {
- is_hole = dnode_block_freed(dn, db->db_blkid) ||
- BP_IS_HOLE(db->db_blkptr);
- }
+ if (!is_hole && db->db_level == 0)
+ is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
if (is_hole) {
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
- if (db->db_blkptr != NULL && db->db_level > 0 &&
- BP_IS_HOLE(db->db_blkptr) &&
- db->db_blkptr->blk_birth != 0) {
- dbuf_handle_indirect_hole(db, dn);
+ if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+ BP_GET_LOGICAL_BIRTH(bp) != 0) {
+ dbuf_handle_indirect_hole(db, dn, bp);
}
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "hole read satisfied");
@@ -1477,32 +1507,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
*/
static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
{
- int err = 0;
objset_t *os = db->db_objset;
- arc_buf_t *dnode_abuf;
- dnode_t *dn;
+ dmu_buf_impl_t *dndb;
+ arc_buf_t *dnbuf;
zbookmark_phys_t zb;
+ int err;
- ASSERT(MUTEX_HELD(&db->db_mtx));
+ if ((flags & DB_RF_NO_DECRYPT) != 0 ||
+ !os->os_encrypted || os->os_raw_receive ||
+ (dndb = dn->dn_dbuf) == NULL)
+ return (0);
- if (!os->os_encrypted || os->os_raw_receive ||
- (flags & DB_RF_NO_DECRYPT) != 0)
+ dnbuf = dndb->db_buf;
+ if (!arc_is_encrypted(dnbuf))
return (0);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
+ mutex_enter(&dndb->db_mtx);
- if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
- DB_DNODE_EXIT(db);
- return (0);
- }
+ /*
+ * Since the dnode buffer is modified by the sync process, there can be
+ * only one copy of it, which means we cannot modify (decrypt) it while
+ * it is being written. I don't see how this may happen now, since
+ * encrypted dnode writes by receive should be completed before any
+ * plain-text reads due to txg wait, but better be safe than sorry.
+ */
+ while (1) {
+ if (!arc_is_encrypted(dnbuf)) {
+ mutex_exit(&dndb->db_mtx);
+ return (0);
+ }
+ dbuf_dirty_record_t *dr = dndb->db_data_pending;
+ if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
+ break;
+ cv_wait(&dndb->db_changed, &dndb->db_mtx);
+ }
SET_BOOKMARK(&zb, dmu_objset_id(os),
- DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
- err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+ DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
+ err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
/*
* An error code of EACCES tells us that the key is still not
@@ -1515,7 +1559,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
err = 0;
- DB_DNODE_EXIT(db);
+ mutex_exit(&dndb->db_mtx);
return (err);
}
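
The rewritten dbuf_read_verify_dnode_crypt() now waits on db_changed while the encrypted dnode buffer is still attached to a pending dirty record, so it never decrypts a buffer that is being written out. Below is a rough userland rendering of that wait loop using pthreads; the types and field names are invented for illustration and do not come from the patch.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for the dnode dbuf state used above. */
typedef struct {
	pthread_mutex_t lock;
	pthread_cond_t changed;
	bool encrypted;		/* arc_is_encrypted() analogue */
	bool write_pending;	/* a dirty record still points at this buf */
} buf_state_t;

/*
 * Block until the buffer is either already plain-text or no longer being
 * written, at which point it is safe to transform it in place.
 */
static void
wait_until_safe(buf_state_t *b)
{
	pthread_mutex_lock(&b->lock);
	while (b->encrypted && b->write_pending)
		pthread_cond_wait(&b->changed, &b->lock);
	pthread_mutex_unlock(&b->lock);
}

int
main(void)
{
	buf_state_t b = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.changed = PTHREAD_COND_INITIALIZER,
		.encrypted = true,
		.write_pending = false,	/* no writer, so we return at once */
	};

	wait_until_safe(&b);
	printf("safe to decrypt in place\n");
	return (0);
}
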
@@ -1525,39 +1569,63 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
* returning.
*/
static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
- db_lock_type_t dblt, void *tag)
+dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
+ db_lock_type_t dblt, const void *tag)
{
- dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
+ blkptr_t bp, *bpp = NULL;
- err = zio_flags = 0;
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_buf == NULL);
ASSERT(db->db_parent == NULL ||
RW_LOCK_HELD(&db->db_parent->db_rwlock));
if (db->db_blkid == DMU_BONUS_BLKID) {
- err = dbuf_read_bonus(db, dn, flags);
+ err = dbuf_read_bonus(db, dn);
goto early_unlock;
}
- err = dbuf_read_hole(db, dn, flags);
+ /*
+ * If we have a pending block clone, we don't want to read the
+ * underlying block, but the content of the block being cloned,
+ * pointed by the dirty record, so we have the most recent data.
+ * If there is no dirty record, then we hit a race in a sync
+ * process when the dirty record is already removed, while the
+ * dbuf is not yet destroyed. Such a case is equivalent to uncached.
+ */
+ if (db->db_state == DB_NOFILL) {
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+ if (dr != NULL) {
+ if (!dr->dt.dl.dr_brtwrite) {
+ err = EIO;
+ goto early_unlock;
+ }
+ bp = dr->dt.dl.dr_overridden_by;
+ bpp = &bp;
+ }
+ }
+
+ if (bpp == NULL && db->db_blkptr != NULL) {
+ bp = *db->db_blkptr;
+ bpp = &bp;
+ }
+
+ err = dbuf_read_hole(db, dn, bpp);
if (err == 0)
goto early_unlock;
+ ASSERT(bpp != NULL);
+
/*
* Any attempt to read a redacted block should result in an error. This
* will never happen under normal conditions, but can be useful for
* debugging purposes.
*/
- if (BP_IS_REDACTED(db->db_blkptr)) {
+ if (BP_IS_REDACTED(bpp)) {
ASSERT(dsl_dataset_feature_is_active(
db->db_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
@@ -1572,25 +1640,20 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
- if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
- spa_log_error(db->db_objset->os_spa, &zb);
- zfs_panic_recover("unencrypted block in encrypted "
- "object set %llu", dmu_objset_id(db->db_objset));
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+ spa_log_error(db->db_objset->os_spa, &zb,
+ BP_GET_LOGICAL_BIRTH(bpp));
err = SET_ERROR(EIO);
goto early_unlock;
}
- err = dbuf_read_verify_dnode_crypt(db, flags);
- if (err != 0)
- goto early_unlock;
-
- DB_DNODE_EXIT(db);
-
db->db_state = DB_READ;
DTRACE_SET_STATE(db, "read issued");
mutex_exit(&db->db_mtx);
- if (dbuf_is_l2cacheable(db))
+ if (!DBUF_IS_CACHEABLE(db))
+ aflags |= ARC_FLAG_UNCACHED;
+ else if (dbuf_is_l2cacheable(db))
aflags |= ARC_FLAG_L2CACHE;
dbuf_add_ref(db, NULL);
@@ -1601,20 +1664,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
zio_flags |= ZIO_FLAG_RAW;
/*
- * The zio layer will copy the provided blkptr later, but we need to
- * do this now so that we can release the parent's rwlock. We have to
- * do that now so that if dbuf_read_done is called synchronously (on
+ * The zio layer will copy the provided blkptr later, but we have our
+ * own copy so that we can release the parent's rwlock. We have to
+ * do that so that if dbuf_read_done is called synchronously (on
* an l1 cache hit) we don't acquire the db_mtx while holding the
* parent's rwlock, which would be a lock ordering violation.
*/
- blkptr_t bp = *db->db_blkptr;
dmu_buf_unlock_parent(db, dblt, tag);
- (void) arc_read(zio, db->db_objset->os_spa, &bp,
+ return (arc_read(zio, db->db_objset->os_spa, bpp,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
- &aflags, &zb);
- return (err);
+ &aflags, &zb));
+
early_unlock:
- DB_DNODE_EXIT(db);
mutex_exit(&db->db_mtx);
dmu_buf_unlock_parent(db, dblt, tag);
return (err);
@@ -1661,7 +1722,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
+ memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
@@ -1691,7 +1752,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
} else {
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
- bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
} else {
db->db_buf = NULL;
dbuf_clear_data(db);
@@ -1699,38 +1760,65 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
}
int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
{
- int err = 0;
- boolean_t prefetch;
dnode_t *dn;
+ boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
+ int err;
- /*
- * We don't have to hold the mutex to check db_state because it
- * can't be freed while we have a hold on the buffer.
- */
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- if (db->db_state == DB_NOFILL)
- return (SET_ERROR(EIO));
-
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
+ /*
+ * Ensure that this block's dnode has been decrypted if the caller
+ * has requested decrypted data.
+ */
+ err = dbuf_read_verify_dnode_crypt(db, dn, flags);
+ if (err != 0)
+ goto done;
+
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
- DBUF_IS_CACHEABLE(db);
+ (flags & DB_RF_NOPREFETCH) == 0;
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_CACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
+ if (flags & DB_RF_PARTIAL_FIRST)
+ db->db_partial_read = B_TRUE;
+ else if (!(flags & DB_RF_PARTIAL_MORE))
+ db->db_partial_read = B_FALSE;
+ miss = (db->db_state != DB_CACHED);
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
/*
- * Ensure that this block's dnode has been decrypted if
- * the caller has requested decrypted data.
+ * Another reader came in while the dbuf was in flight between
+ * UNCACHED and CACHED. Either a writer will finish filling
+ * the buffer, sending the dbuf to CACHED, or the first reader's
+ * request will reach the read_done callback and send the dbuf
+ * to CACHED. Otherwise, a failure occurred and the dbuf will
+ * be sent to UNCACHED.
*/
- err = dbuf_read_verify_dnode_crypt(db, flags);
+ if (flags & DB_RF_NEVERWAIT) {
+ mutex_exit(&db->db_mtx);
+ DB_DNODE_EXIT(db);
+ goto done;
+ }
+ do {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
+ zio_t *, pio);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ } while (db->db_state == DB_READ || db->db_state == DB_FILL);
+ if (db->db_state == DB_UNCACHED) {
+ err = SET_ERROR(EIO);
+ mutex_exit(&db->db_mtx);
+ DB_DNODE_EXIT(db);
+ goto done;
+ }
+ }
+ if (db->db_state == DB_CACHED) {
/*
* If the arc buf is compressed or encrypted and the caller
* requested uncompressed data, we need to untransform it
@@ -1738,11 +1826,11 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
* unauthenticated blocks, which will verify their MAC if
* the key is now available.
*/
- if (err == 0 && db->db_buf != NULL &&
- (flags & DB_RF_NO_DECRYPT) == 0 &&
+ if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
(arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ spa_t *spa = dn->dn_objset->os_spa;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
@@ -1752,80 +1840,49 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
- if (err == 0 && prefetch) {
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- B_FALSE, flags & DB_RF_HAVESTRUCT);
- }
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_hits);
- } else if (db->db_state == DB_UNCACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
- boolean_t need_wait = B_FALSE;
-
+ } else {
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
-
- if (zio == NULL &&
- db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ if (pio == NULL && (db->db_state == DB_NOFILL ||
+ (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
+ spa_t *spa = dn->dn_objset->os_spa;
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
need_wait = B_TRUE;
}
- err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
- /*
- * dbuf_read_impl has dropped db_mtx and our parent's rwlock
- * for us
- */
- if (!err && prefetch) {
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- db->db_state != DB_CACHED,
- flags & DB_RF_HAVESTRUCT);
- }
+ err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
+ /* dbuf_read_impl drops db_mtx and parent's rwlock. */
+ miss = (db->db_state != DB_CACHED);
+ }
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
+ if (err == 0 && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
+ flags & DB_RF_HAVESTRUCT);
+ }
+ DB_DNODE_EXIT(db);
- /*
- * If we created a zio_root we must execute it to avoid
- * leaking it, even if it isn't attached to any work due
- * to an error in dbuf_read_impl().
- */
- if (need_wait) {
- if (err == 0)
- err = zio_wait(zio);
- else
- VERIFY0(zio_wait(zio));
- }
- } else {
- /*
- * Another reader came in while the dbuf was in flight
- * between UNCACHED and CACHED. Either a writer will finish
- * writing the buffer (sending the dbuf to CACHED) or the
- * first reader's request will reach the read_done callback
- * and send the dbuf to CACHED. Otherwise, a failure
- * occurred and the dbuf went to UNCACHED.
- */
- mutex_exit(&db->db_mtx);
- if (prefetch) {
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- B_TRUE, flags & DB_RF_HAVESTRUCT);
- }
- DB_DNODE_EXIT(db);
- DBUF_STAT_BUMP(hash_misses);
+ /*
+ * If we created a zio we must execute it to avoid leaking it, even if
+ * it isn't attached to any work due to an error in dbuf_read_impl().
+ */
+ if (need_wait) {
+ if (err == 0)
+ err = zio_wait(pio);
+ else
+ (void) zio_wait(pio);
+ pio = NULL;
+ }
- /* Skip the wait per the caller's request. */
- if ((flags & DB_RF_NEVERWAIT) == 0) {
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ ||
- db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
- db, zio_t *, zio);
- cv_wait(&db->db_changed, &db->db_mtx);
- }
- if (db->db_state == DB_UNCACHED)
- err = SET_ERROR(EIO);
- mutex_exit(&db->db_mtx);
- }
+done:
+ if (miss)
+ DBUF_STAT_BUMP(hash_misses);
+ else
+ DBUF_STAT_BUMP(hash_hits);
+ if (pio && err != 0) {
+ zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ zio->io_error = err;
+ zio_nowait(zio);
}
return (err);
@@ -1879,8 +1936,13 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
+ if (dr->dt.dl.dr_brtwrite) {
+ ASSERT0P(dr->dt.dl.dr_data);
+ dr->dt.dl.dr_data = db->db_buf;
+ }
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_brtwrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE;
/*
@@ -1891,7 +1953,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
* the buf thawed to save the effort of freezing &
* immediately re-thawing it.
*/
- arc_release(dr->dt.dl.dr_data, db);
+ if (dr->dt.dl.dr_data)
+ arc_release(dr->dt.dl.dr_data, db);
}
/*
@@ -1989,7 +2052,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
ASSERT(db->db.db_data != NULL);
arc_release(db->db_buf, db);
rw_enter(&db->db_rwlock, RW_WRITER);
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
arc_buf_freeze(db->db_buf);
}
@@ -1997,8 +2060,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
mutex_exit(&db->db_mtx);
}
- kmem_free(db_search, sizeof (dmu_buf_impl_t));
mutex_exit(&dn->dn_dbufs_mtx);
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
}
void
@@ -2026,10 +2089,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
/* copy old block data to the new block */
old_buf = db->db_buf;
- bcopy(old_buf->b_data, buf->b_data, MIN(osize, size));
+ memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));
/* zero the remainder */
if (size > osize)
- bzero((uint8_t *)buf->b_data + osize, size - osize);
+ memset((uint8_t *)buf->b_data + osize, 0, size - osize);
mutex_enter(&db->db_mtx);
dbuf_set_data(db, buf);
@@ -2110,7 +2173,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
* Otherwise the buffer contents could be inconsistent between the
* dbuf and the lightweight dirty record.
*/
- ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
+ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
+ NULL));
mutex_enter(&dn->dn_mtx);
int txgoff = tx->tx_txg & TXG_MASK;
@@ -2262,7 +2326,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_blkid != DMU_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dmu_objset_willuse_space(os, db->db.db_size, tx);
}
@@ -2305,8 +2369,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
- if (db->db_blkid != DMU_BONUS_BLKID)
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dr->dr_accounted = db->db.db_size;
+ }
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
list_insert_before(&db->db_dirty_records, dr_next, dr);
@@ -2462,10 +2527,11 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
* Undirty a buffer in the transaction group referenced by the given
* transaction. Return whether this evicted the dbuf.
*/
-static boolean_t
+boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
+ boolean_t brtwrite;
ASSERT(txg != 0);
@@ -2490,6 +2556,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (B_FALSE);
ASSERT(dr->dr_dbuf == db);
+ brtwrite = dr->dt.dl.dr_brtwrite;
+ if (brtwrite) {
+ /*
+ * We are freeing a block that we cloned in the same
+ * transaction group.
+ */
+ brt_pending_remove(dmu_objset_spa(db->db_objset),
+ &dr->dt.dl.dr_overridden_by, tx);
+ }
+
dnode_t *dn = dr->dr_dnode;
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -2519,7 +2595,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
}
- if (db->db_state != DB_NOFILL) {
+ if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);
ASSERT(db->db_buf != NULL);
@@ -2534,7 +2610,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+ arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
}
@@ -2546,30 +2623,40 @@ static void
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ boolean_t undirty = B_FALSE;
ASSERT(tx->tx_txg != 0);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
/*
- * Quick check for dirtiness. For already dirty blocks, this
- * reduces runtime of this function by >90%, and overall performance
- * by 50% for some workloads (e.g. file deletion with indirect blocks
- * cached).
+ * Quick check for dirtiness to improve performance for some workloads
+ * (e.g. file deletion with indirect blocks cached).
*/
mutex_enter(&db->db_mtx);
-
- if (db->db_state == DB_CACHED) {
- dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
/*
- * It's possible that it is already dirty but not cached,
+ * It's possible that the dbuf is already dirty but not cached,
* because there are some calls to dbuf_dirty() that don't
* go through dmu_buf_will_dirty().
*/
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
if (dr != NULL) {
- /* This dbuf is already dirty and cached. */
- dbuf_redirty(dr);
- mutex_exit(&db->db_mtx);
- return;
+ if (db->db_level == 0 &&
+ dr->dt.dl.dr_brtwrite) {
+ /*
+ * Block cloning: If we are dirtying a cloned
+ * level 0 block, we cannot simply redirty it,
+ * because this dr has no associated data.
+ * We will go through a full undirtying below,
+ * before dirtying it again.
+ */
+ undirty = B_TRUE;
+ } else {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
}
}
mutex_exit(&db->db_mtx);
@@ -2578,7 +2665,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
+
+ /*
+ * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
+ * want to make sure dbuf_read() will read the pending cloned block and
+ * not the underlying block that is being replaced. dbuf_undirty() will
+ * do dbuf_unoverride(), so we will end up with the cloned block content,
+ * without the overridden BP.
+ */
(void) dbuf_read(db, NULL, flags);
+ if (undirty) {
+ mutex_enter(&db->db_mtx);
+ VERIFY(!dbuf_undirty(db, tx));
+ mutex_exit(&db->db_mtx);
+ }
(void) dbuf_dirty(db, tx);
}
@@ -2602,17 +2702,51 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
}
void
+dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ /*
+ * Block cloning: We are going to clone into this block, so undirty
+ * modifications done to this block so far in this txg. This includes
+ * writes and clones into this block.
+ */
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+ VERIFY(!dbuf_undirty(db, tx));
+ ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+ if (db->db_buf != NULL) {
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+
+ db->db_state = DB_NOFILL;
+ DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ mutex_enter(&db->db_mtx);
db->db_state = DB_NOFILL;
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
- dmu_buf_will_fill(db_fake, tx);
+ mutex_exit(&db->db_mtx);
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
}
void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -2624,6 +2758,25 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
dmu_tx_private_ok(tx));
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_NOFILL) {
+ /*
+ * Block cloning: We will be completely overwriting a block
+ * cloned in this transaction group, so let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done. But if the fill can fail
+ * we should have a way to fall back to the cloned data.
+ */
+ if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_will_dirty(db_fake, tx);
+ return;
+ }
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
+ }
+ mutex_exit(&db->db_mtx);
+
dbuf_noread(db);
(void) dbuf_dirty(db, tx);
}
@@ -2659,9 +2812,9 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
dr->dt.dl.dr_has_raw_params = B_TRUE;
dr->dt.dl.dr_byteorder = byteorder;
- bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
- bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
- bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
+ memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
+ memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
+ memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
}
static void
@@ -2671,39 +2824,50 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
dbuf_dirty_record_t *dr;
dr = list_head(&db->db_dirty_records);
+ ASSERT3P(dr, !=, NULL);
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
dl = &dr->dt.dl;
dl->dr_overridden_by = *bp;
dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = dr->dr_txg;
+ BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
-/* ARGSUSED */
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
{
+ (void) tx;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- dbuf_states_t old_state;
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
- old_state = db->db_state;
- db->db_state = DB_CACHED;
- if (old_state == DB_FILL) {
+ if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
db->db_freed_in_flight = FALSE;
+ db->db_state = DB_CACHED;
DTRACE_SET_STATE(db,
"fill done handling freed in flight");
+ failed = B_FALSE;
+ } else if (failed) {
+ VERIFY(!dbuf_undirty(db, tx));
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ DTRACE_SET_STATE(db, "fill failed");
} else {
+ db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "fill done");
}
cv_broadcast(&db->db_changed);
+ } else {
+ db->db_state = DB_CACHED;
+ failed = B_FALSE;
}
mutex_exit(&db->db_mtx);
+ return (failed);
}
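The canfail/failed plumbing above gives callers a two-step contract: announce the write with dmu_buf_will_fill(db, tx, canfail), attempt the copy, then report the outcome through dmu_buf_fill_done(). A minimal sketch of that pairing follows; only the two fill functions and the dmu_buf_t fields come from the code above, while my_fill_buf() and my_copy_in() are hypothetical:

/* Hypothetical caller of the fill interface changed above. */
extern int my_copy_in(void *dst, const void *src, uint64_t len);

static int
my_fill_buf(dmu_buf_t *db, dmu_tx_t *tx, const void *src, uint64_t len)
{
	int err;

	/* canfail = B_TRUE: an existing dirty clone is kept for fallback. */
	dmu_buf_will_fill(db, tx, B_TRUE);

	err = my_copy_in(db->db_data, src, MIN(len, db->db_size));

	/*
	 * Report success or failure; the return value indicates whether a
	 * failed fill really left the dbuf without usable contents.
	 */
	(void) dmu_buf_fill_done(db, tx, err != 0);

	return (err);
}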
void
@@ -2732,6 +2896,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
dmu_buf_will_not_fill(dbuf, tx);
dr = list_head(&db->db_dirty_records);
+ ASSERT3P(dr, !=, NULL);
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
dl = &dr->dt.dl;
encode_embedded_bp_compressed(&dl->dr_overridden_by,
@@ -2742,7 +2907,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = dr->dr_txg;
+ BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
void
@@ -2793,7 +2958,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
if (db->db_state == DB_CACHED &&
zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
@@ -2806,7 +2972,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(!arc_is_encrypted(buf));
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
- bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+ memcpy(db->db.db_data, buf->b_data, db->db.db_size);
arc_buf_destroy(buf, db);
return;
}
@@ -2830,6 +2996,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
arc_buf_destroy(db->db_buf, db);
}
db->db_buf = NULL;
+ } else if (db->db_state == DB_NOFILL) {
+ /*
+ * We will be completely replacing the cloned block. In case
+ * it was cloned in this transaction group, let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done.
+ */
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
}
ASSERT(db->db_buf == NULL);
dbuf_set_data(db, buf);
@@ -2837,7 +3012,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
DTRACE_SET_STATE(db, "filling assigned arcbuf");
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
- dmu_buf_fill_done(&db->db, tx);
+ dmu_buf_fill_done(&db->db, tx, B_FALSE);
}
void
@@ -2873,6 +3048,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ ASSERT0(dmu_buf_user_size(&db->db));
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
@@ -2945,9 +3122,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT(!multilist_link_active(&db->db_cache_link));
- kmem_cache_free(dbuf_kmem_cache, db);
- arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
-
/*
* If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
@@ -2956,6 +3130,9 @@ dbuf_destroy(dmu_buf_impl_t *db)
mutex_enter(&parent->db_mtx);
dbuf_rele_and_unlock(parent, db, B_TRUE);
}
+
+ kmem_cache_free(dbuf_kmem_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
}
/*
@@ -3057,7 +3234,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
- dmu_buf_impl_t *parent, blkptr_t *blkptr)
+ dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
{
objset_t *os = dn->dn_objset;
dmu_buf_impl_t *db, *odb;
@@ -3078,6 +3255,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
+ db->db_hash = hash;
db->db_user = NULL;
db->db_user_immediate_evict = FALSE;
@@ -3161,6 +3339,7 @@ dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
if (err == 0) {
+ ASSERT3P(bp2, !=, NULL);
*bp = *bp2;
if (dbp != NULL)
dbuf_rele(dbp, NULL);
@@ -3189,8 +3368,10 @@ typedef struct dbuf_prefetch_arg {
static void
dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
{
- if (dpa->dpa_cb != NULL)
- dpa->dpa_cb(dpa->dpa_arg, io_done);
+ if (dpa->dpa_cb != NULL) {
+ dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,
+ dpa->dpa_zb.zb_blkid, io_done);
+ }
kmem_free(dpa, sizeof (*dpa));
}
@@ -3198,11 +3379,13 @@ static void
dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
+ (void) zio, (void) zb, (void) iobp;
dbuf_prefetch_arg_t *dpa = private;
- dbuf_prefetch_fini(dpa, B_TRUE);
if (abuf != NULL)
arc_buf_destroy(abuf, private);
+
+ dbuf_prefetch_fini(dpa, B_TRUE);
}
/*
@@ -3246,6 +3429,7 @@ static void
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
+ (void) zb, (void) iobp;
dbuf_prefetch_arg_t *dpa = private;
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
@@ -3253,7 +3437,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
if (abuf == NULL) {
ASSERT(zio == NULL || zio->io_error != 0);
- return (dbuf_prefetch_fini(dpa, B_TRUE));
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ return;
}
ASSERT(zio == NULL || zio->io_error == 0);
@@ -3286,7 +3471,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dpa->dpa_curlevel, curblkid, FTAG);
if (db == NULL) {
arc_buf_destroy(abuf, private);
- return (dbuf_prefetch_fini(dpa, B_TRUE));
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ return;
}
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
@@ -3299,12 +3485,14 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- ASSERT(!BP_IS_REDACTED(bp) ||
+ ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
dsl_dataset_feature_is_active(
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
- SPA_FEATURE_REDACTED_DATASETS));
+ SPA_FEATURE_REDACTED_DATASETS)));
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
+ arc_buf_destroy(abuf, private);
dbuf_prefetch_fini(dpa, B_TRUE);
+ return;
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
@@ -3322,7 +3510,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ bp, dbuf_prefetch_indirect_done, dpa,
+ ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
@@ -3368,7 +3557,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
goto no_issue;
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
- level, blkid);
+ level, blkid, NULL);
if (db != NULL) {
mutex_exit(&db->db_mtx);
/*
@@ -3432,8 +3621,9 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
dpa->dpa_cb = cb;
dpa->dpa_arg = arg;
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (dnode_level_is_l2cacheable(&bp, dn, level))
+ if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
+ dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
+ else if (dnode_level_is_l2cacheable(&bp, dn, level))
dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
/*
@@ -3457,7 +3647,8 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, curlevel, curblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ &bp, dbuf_prefetch_indirect_done, dpa,
+ ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
@@ -3469,7 +3660,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
return (1);
no_issue:
if (cb != NULL)
- cb(arg, B_FALSE);
+ cb(arg, level, blkid, B_FALSE);
return (0);
}
@@ -3518,7 +3709,7 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
}
rw_enter(&db->db_rwlock, RW_WRITER);
- bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+ memcpy(db->db.db_data, data->b_data, arc_buf_size(data));
rw_exit(&db->db_rwlock);
}
@@ -3529,9 +3720,10 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp)
+ const void *tag, dmu_buf_impl_t **dbp)
{
dmu_buf_impl_t *db, *parent = NULL;
+ uint64_t hv;
/* If the pool has been created, verify the tx_sync_lock is not held */
spa_t *spa = dn->dn_objset->os_spa;
@@ -3547,7 +3739,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
*dbp = NULL;
/* dbuf_find() returns with db_mtx held */
- db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
if (db == NULL) {
blkptr_t *bp = NULL;
@@ -3569,7 +3761,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
}
if (err && err != ENOENT)
return (err);
- db = dbuf_create(dn, level, blkid, parent, bp);
+ db = dbuf_create(dn, level, blkid, parent, bp, hv);
}
if (fail_uncached && db->db_state != DB_CACHED) {
@@ -3593,8 +3785,10 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
- if (dr->dt.dl.dr_data == db->db_buf)
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ ASSERT3P(db->db_buf, !=, NULL);
dbuf_hold_copy(dn, db);
+ }
}
if (multilist_link_active(&db->db_cache_link)) {
@@ -3603,9 +3797,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ uint64_t size = db->db.db_size;
+ uint64_t usize = dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[db->db_caching_status].size,
- db->db.db_size, db);
+ &dbuf_caches[db->db_caching_status].size, size, db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size, usize,
+ db->db_user);
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
@@ -3613,7 +3812,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ size + usize);
}
db->db_caching_status = DB_NO_CACHE;
}
@@ -3634,13 +3833,13 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
}
dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
{
return (dbuf_hold_level(dn, 0, blkid, tag));
}
dmu_buf_impl_t *
-dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
{
dmu_buf_impl_t *db;
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
@@ -3653,7 +3852,8 @@ dbuf_create_bonus(dnode_t *dn)
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
+ dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
}
int
@@ -3681,7 +3881,7 @@ dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
-dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
{
int64_t holds = zfs_refcount_add(&db->db_holds, tag);
VERIFY3S(holds, >, 1);
@@ -3690,7 +3890,7 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
boolean_t
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
- void *tag)
+ const void *tag)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dmu_buf_impl_t *found_db;
@@ -3699,7 +3899,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
if (blkid == DMU_BONUS_BLKID)
found_db = dbuf_find_bonus(os, obj);
else
- found_db = dbuf_find(os, obj, 0, blkid);
+ found_db = dbuf_find(os, obj, 0, blkid, NULL);
if (found_db != NULL) {
if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
@@ -3719,14 +3919,14 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
* dnode's parent dbuf evicting its dnode handles.
*/
void
-dbuf_rele(dmu_buf_impl_t *db, void *tag)
+dbuf_rele(dmu_buf_impl_t *db, const void *tag)
{
mutex_enter(&db->db_mtx);
dbuf_rele_and_unlock(db, tag, B_FALSE);
}
void
-dmu_buf_rele(dmu_buf_t *db, void *tag)
+dmu_buf_rele(dmu_buf_t *db, const void *tag)
{
dbuf_rele((dmu_buf_impl_t *)db, tag);
}
@@ -3745,7 +3945,7 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
*
*/
void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
{
int64_t holds;
uint64_t size;
@@ -3819,59 +4019,41 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
* This dbuf has anonymous data associated with it.
*/
dbuf_destroy(db);
- } else {
- boolean_t do_arc_evict = B_FALSE;
- blkptr_t bp;
- spa_t *spa = dmu_objset_spa(db->db_objset);
-
- if (!DBUF_IS_CACHEABLE(db) &&
- db->db_blkptr != NULL &&
- !BP_IS_HOLE(db->db_blkptr) &&
- !BP_IS_EMBEDDED(db->db_blkptr)) {
- do_arc_evict = B_TRUE;
- bp = *db->db_blkptr;
- }
-
- if (!DBUF_IS_CACHEABLE(db) ||
- db->db_pending_evict) {
- dbuf_destroy(db);
- } else if (!multilist_link_active(&db->db_cache_link)) {
- ASSERT3U(db->db_caching_status, ==,
- DB_NO_CACHE);
-
- dbuf_cached_state_t dcs =
- dbuf_include_in_metadata_cache(db) ?
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
- db->db_caching_status = dcs;
-
- multilist_insert(&dbuf_caches[dcs].cache, db);
- uint64_t db_size = db->db.db_size;
- size = zfs_refcount_add_many(
- &dbuf_caches[dcs].size, db_size, db);
- uint8_t db_level = db->db_level;
- mutex_exit(&db->db_mtx);
-
- if (dcs == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMP(metadata_cache_count);
- DBUF_STAT_MAX(
- metadata_cache_size_bytes_max,
- size);
- } else {
- DBUF_STAT_BUMP(cache_count);
- DBUF_STAT_MAX(cache_size_bytes_max,
- size);
- DBUF_STAT_BUMP(cache_levels[db_level]);
- DBUF_STAT_INCR(
- cache_levels_bytes[db_level],
- db_size);
- }
+ } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
+ db->db_pending_evict) {
+ dbuf_destroy(db);
+ } else if (!multilist_link_active(&db->db_cache_link)) {
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
+ multilist_insert(&dbuf_caches[dcs].cache, db);
+ uint64_t db_size = db->db.db_size;
+ uint64_t dbu_size = dmu_buf_user_size(&db->db);
+ (void) zfs_refcount_add_many(
+ &dbuf_caches[dcs].size, db_size, db);
+ size = zfs_refcount_add_many(
+ &dbuf_caches[dcs].size, dbu_size, db->db_user);
+ uint8_t db_level = db->db_level;
+ mutex_exit(&db->db_mtx);
- if (dcs == DB_DBUF_CACHE && !evicting)
- dbuf_evict_notify(size);
+ if (dcs == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMP(metadata_cache_count);
+ DBUF_STAT_MAX(metadata_cache_size_bytes_max,
+ size);
+ } else {
+ DBUF_STAT_BUMP(cache_count);
+ DBUF_STAT_MAX(cache_size_bytes_max, size);
+ DBUF_STAT_BUMP(cache_levels[db_level]);
+ DBUF_STAT_INCR(cache_levels_bytes[db_level],
+ db_size + dbu_size);
}
- if (do_arc_evict)
- arc_freed(spa, &bp);
+ if (dcs == DB_DBUF_CACHE && !evicting)
+ dbuf_evict_notify(size);
}
} else {
mutex_exit(&db->db_mtx);
@@ -3948,8 +4130,37 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
return (db->db_user);
}
+uint64_t
+dmu_buf_user_size(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ if (db->db_user == NULL)
+ return (0);
+ return (atomic_load_64(&db->db_user->dbu_size));
+}
+
+void
+dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
+ atomic_add_64(&db->db_user->dbu_size, nadd);
+}
+
void
-dmu_buf_user_evict_wait()
+dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
+ atomic_sub_64(&db->db_user->dbu_size, nsub);
+}
+
+void
+dmu_buf_user_evict_wait(void)
{
taskq_wait(dbu_evict_taskq);
}
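The dmu_buf_user_size()/dmu_buf_add_user_size()/dmu_buf_sub_user_size() helpers added above let a dbuf user charge its private payload against the dbuf cache accounting. A minimal sketch of how a user might keep that in sync, assuming the usual pattern of embedding a dmu_buf_user_t; my_user_t and my_resize() are hypothetical:

/* Hypothetical dbuf user tracking a variable-size private payload. */
typedef struct my_user {
	dmu_buf_user_t	mu_dbu;		/* mu_dbu.dbu_size is what gets charged */
	uint64_t	mu_len;		/* current payload size in bytes */
} my_user_t;

static void
my_resize(dmu_buf_t *db, my_user_t *mu, uint64_t newlen)
{
	/*
	 * Size updates are only legal while the dbuf is held (the helpers
	 * assert db_caching_status == DB_NO_CACHE); the cached totals are
	 * adjusted when the dbuf later moves onto or off the dbuf cache.
	 */
	if (newlen > mu->mu_len)
		dmu_buf_add_user_size(db, newlen - mu->mu_len);
	else if (newlen < mu->mu_len)
		dmu_buf_sub_user_size(db, mu->mu_len - newlen);
	mu->mu_len = newlen;
}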
@@ -3968,21 +4179,6 @@ dmu_buf_get_objset(dmu_buf_t *db)
return (dbi->db_objset);
}
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_ENTER(dbi);
- return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_EXIT(dbi);
-}
-
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
@@ -4042,7 +4238,7 @@ dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dnode_t *dn = dr->dr_dnode;
ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
- bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
+ memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
dbuf_sync_leaf_verify_bonus_dnode(dr);
@@ -4244,22 +4440,6 @@ dbuf_lightweight_ready(zio_t *zio)
}
static void
-dbuf_lightweight_physdone(zio_t *zio)
-{
- dbuf_dirty_record_t *dr = zio->io_private;
- dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
- ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
- /*
- * The callback will be called io_phys_children times. Retire one
- * portion of our dirty space each time we are called. Any rounding
- * error will be cleaned up by dbuf_lightweight_done().
- */
- int delta = dr->dr_accounted / zio->io_phys_children;
- dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
-static void
dbuf_lightweight_done(zio_t *zio)
{
dbuf_dirty_record_t *dr = zio->io_private;
@@ -4277,16 +4457,8 @@ dbuf_lightweight_done(zio_t *zio)
dsl_dataset_block_born(ds, zio->io_bp, tx);
}
- /*
- * See comment in dbuf_write_done().
- */
- if (zio->io_phys_children == 0) {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted, zio->io_txg);
- } else {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted % zio->io_phys_children, zio->io_txg);
- }
+ dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+ zio->io_txg);
abd_free(dr->dt.dll.dr_abd);
kmem_free(dr, sizeof (*dr));
@@ -4320,8 +4492,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
- dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
- ZIO_PRIORITY_ASYNC_WRITE,
+ dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
zio_nowait(dr->dr_zio);
@@ -4356,6 +4527,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else if (db->db_state == DB_READ) {
+ /*
+ * This buffer has a clone we need to write, and an in-flight
+ * read on the BP we're about to clone. It's safe to issue the
+ * write here because the read has already been issued and the
+ * contents won't change.
+ */
+ ASSERT(dr->dt.dl.dr_brtwrite &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
@@ -4412,7 +4592,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
/*
@@ -4422,11 +4601,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
dbuf_prepare_encrypted_dnode_leaf(dr);
- if (db->db_state != DB_NOFILL &&
+ if (*datap != NULL && *datap == db->db_buf &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
zfs_refcount_count(&db->db_holds) > 1 &&
- dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
- *datap == db->db_buf) {
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN) {
/*
* If this buffer is currently "in use" (i.e., there
* are active holds and db_data still references it),
@@ -4462,7 +4640,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
} else {
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
- bcopy(db->db.db_data, (*datap)->b_data, psize);
+ memcpy((*datap)->b_data, db->db.db_data, psize);
}
db->db_data_pending = dr;
@@ -4478,6 +4656,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
}
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs. May be
+ * called recursively from dbuf_sync_indirect().
+ */
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
@@ -4512,10 +4694,10 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
}
}
-/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
+ (void) buf;
dmu_buf_impl_t *db = vdb;
dnode_t *dn;
blkptr_t *bp = zio->io_bp;
@@ -4534,7 +4716,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
- if (bp->blk_birth != 0) {
+ if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
@@ -4571,6 +4753,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
i += DNODE_MIN_SIZE;
if (dnp->dn_type != DMU_OT_NONE) {
fill++;
+ for (int j = 0; j < dnp->dn_nblkptr;
+ j++) {
+ (void) zfs_blkptr_verify(spa,
+ &dnp->dn_blkptr[j],
+ BLK_CONFIG_SKIP,
+ BLK_VERIFY_HALT);
+ }
+ if (dnp->dn_flags &
+ DNODE_FLAG_SPILL_BLKPTR) {
+ (void) zfs_blkptr_verify(spa,
+ DN_SPILL_BLKPTR(dnp),
+ BLK_CONFIG_SKIP,
+ BLK_VERIFY_HALT);
+ }
i += dnp->dn_extra_slots *
DNODE_MIN_SIZE;
}
@@ -4588,6 +4784,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
+ (void) zfs_blkptr_verify(spa, ibp,
+ BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
fill += BP_GET_FILL(ibp);
}
}
@@ -4603,7 +4801,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dmu_buf_unlock_parent(db, dblt, FTAG);
}
-/* ARGSUSED */
/*
* This function gets called just prior to running through the compression
* stage of the zio pipeline. If we're an indirect block comprised of only
@@ -4614,6 +4811,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
static void
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
+ (void) zio, (void) buf;
dmu_buf_impl_t *db = vdb;
dnode_t *dn;
blkptr_t *bp;
@@ -4642,47 +4840,16 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
* zero out.
*/
rw_enter(&db->db_rwlock, RW_WRITER);
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);
}
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times). This
- * allows the DMU to monitor the progress of each logical i/o. For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block. There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-/* ARGSUSED */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
- dmu_buf_impl_t *db = arg;
- objset_t *os = db->db_objset;
- dsl_pool_t *dp = dmu_objset_pool(os);
- dbuf_dirty_record_t *dr;
- int delta = 0;
-
- dr = db->db_data_pending;
- ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
- /*
- * The callback will be called io_phys_children times. Retire one
- * portion of our dirty space each time we are called. Any rounding
- * error will be cleaned up by dbuf_write_done().
- */
- delta = dr->dr_accounted / zio->io_phys_children;
- dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
-/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
+ (void) buf;
dmu_buf_impl_t *db = vdb;
blkptr_t *bp_orig = &zio->io_bp_orig;
blkptr_t *bp = db->db_blkptr;
@@ -4726,9 +4893,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (db->db_level == 0) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
- if (db->db_state != DB_NOFILL) {
- if (dr->dt.dl.dr_data != db->db_buf)
- arc_buf_destroy(dr->dt.dl.dr_data, db);
+ if (dr->dt.dl.dr_data != NULL &&
+ dr->dt.dl.dr_data != db->db_buf) {
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -4751,27 +4918,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
- /*
- * If we didn't do a physical write in this ZIO and we
- * still ended up here, it means that the space of the
- * dbuf that we just released (and undirtied) above hasn't
- * been marked as undirtied in the pool's accounting.
- *
- * Thus, we undirty that space in the pool's view of the
- * world here. For physical writes this type of update
- * happens in dbuf_write_physdone().
- *
- * If we did a physical write, cleanup any rounding errors
- * that came up due to writing multiple copies of a block
- * on disk [see dbuf_write_physdone()].
- */
- if (zio->io_phys_children == 0) {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted, zio->io_txg);
- } else {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted % zio->io_phys_children, zio->io_txg);
- }
+ dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+ zio->io_txg);
kmem_free(dr, sizeof (dbuf_dirty_record_t));
}
@@ -4853,7 +5001,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
drica.drica_os = dn->dn_objset;
- drica.drica_blk_birth = bp->blk_birth;
+ drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
drica.drica_tx = tx;
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
@@ -4868,7 +5016,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
if (dn->dn_objset != spa_meta_objset(spa)) {
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ BP_GET_LOGICAL_BIRTH(bp) >
+ ds->ds_dir->dd_origin_txg) {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -4928,7 +5077,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
}
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
+ */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
@@ -4946,21 +5098,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
os = dn->dn_objset;
- if (db->db_state != DB_NOFILL) {
- if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- if (BP_IS_HOLE(db->db_blkptr)) {
- arc_buf_thaw(data);
- } else {
- dbuf_release_bp(db);
- }
- dbuf_remap(dn, db, tx);
- }
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather than in
+ * dbuf_dirty() since they are only modified in the syncing
+ * context and we don't want the overhead of making multiple
+ * copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr))
+ arc_buf_thaw(data);
+ else
+ dbuf_release_bp(db);
+ dbuf_remap(dn, db, tx);
}
if (parent != dn->dn_dbuf) {
@@ -4987,7 +5136,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
}
ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
ASSERT(pio);
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
@@ -4996,7 +5145,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
if (db->db_blkid == DMU_SPILL_BLKID)
wp_flag = WP_SPILL;
- wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+ wp_flag |= (data == NULL) ? WP_NOFILL : 0;
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
@@ -5019,20 +5168,21 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_override_ready, NULL, NULL,
+ dbuf_write_override_ready, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
- dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+ dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
- } else if (db->db_state == DB_NOFILL) {
+ } else if (data == NULL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_nofill_ready, NULL, NULL,
+ dbuf_write_nofill_ready, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
@@ -5049,11 +5199,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
children_ready_cb = dbuf_write_children_ready;
dr->dr_zio = arc_write(pio, os->os_spa, txg,
- &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db),
- &zp, dbuf_write_ready,
- children_ready_cb, dbuf_write_physdone,
- dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED, &zb);
+ &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
+ dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+ children_ready_cb, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
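As the comment added above dbuf_write() notes, the function only populates dr->dr_zio; issuing it is the caller's job. Schematically, the sync path consumes it like this (a sketch of the existing pattern, not new code):

	dbuf_write(dr, *datap, tx);	/* builds dr->dr_zio, does not issue it */
	/* ... */
	zio_nowait(dr->dr_zio);		/* caller issues (or zio_wait()s) the write */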
@@ -5071,6 +5220,7 @@ EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_clone);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
@@ -5093,25 +5243,23 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_get_blkptr);
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
"Maximum size in bytes of the dbuf cache.");
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
- "Percentage over dbuf_cache_max_bytes when dbufs must be evicted "
- "directly.");
+ "Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
- "Percentage below dbuf_cache_max_bytes when the evict thread stops "
- "evicting dbufs.");
+ "Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
+ "Maximum size in bytes of dbuf metadata cache.");
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
- "Maximum size in bytes of the dbuf metadata cache.");
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
+ "Set size of dbuf cache to log2 fraction of arc size.");
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
- "Set the size of the dbuf cache to a log2 fraction of arc size.");
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
+ "Set size of dbuf metadata cache to log2 fraction of arc size.");
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
- "Set the size of the dbuf metadata cache to a log2 fraction of arc "
- "size.");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
+ "Set size of dbuf cache mutex array as log2 shift.");
diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
index 12bb568a08cc..ccee8997e10e 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf_stats.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -46,14 +46,14 @@ static int
dbuf_stats_hash_table_headers(char *buf, size_t size)
{
(void) snprintf(buf, size,
- "%-96s | %-119s | %s\n"
- "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
+ "%-105s | %-119s | %s\n"
+ "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | "
"%-5s %-5s %-9s %-6s %-8s %-12s "
"%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
"%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
"dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
- "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
- "list", "atype", "flags", "count", "asize", "access",
+ "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds",
+ "dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize");
@@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
__dmu_object_info_from_dnode(dn, &doi);
nwritten = snprintf(buf, size,
- "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
- "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
+ "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu "
+ "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
"%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
/* dmu_buf_impl_t */
@@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
(longlong_t)db->db_blkid,
(u_longlong_t)db->db.db_offset,
(u_longlong_t)db->db.db_size,
+ (u_longlong_t)dmu_buf_user_size(&db->db),
!!dbuf_is_metadata(db),
db->db_state,
(ulong_t)zfs_refcount_count(&db->db_holds),
@@ -226,7 +227,5 @@ dbuf_stats_destroy(void)
dbuf_stats_hash_table_destroy();
}
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW,
"Calculate arc header index");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c
index fe5a188f4da1..4c53cb0a2f9b 100644
--- a/sys/contrib/openzfs/module/zfs/ddt.c
+++ b/sys/contrib/openzfs/module/zfs/ddt.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -22,6 +22,8 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
+ * Copyright (c) 2023, Klara Inc.
*/
#include <sys/zfs_context.h>
@@ -29,15 +31,119 @@
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/ddt.h>
+#include <sys/ddt_impl.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
+/*
+ * # DDT: Deduplication tables
+ *
+ * The dedup subsystem provides block-level deduplication. When enabled, blocks
+ * to be written will have the dedup (D) bit set, which causes them to be
+ * tracked in a "dedup table", or DDT. If a block has been seen before (exists
+ * in the DDT), it is not written again; instead it is made to reference the
+ * existing on-disk data and its refcount in the DDT is bumped.
+ *
+ * ## Dedup tables and entries
+ *
+ * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
+ * (ddt_key_t) made up of a block's checksum and certain properties, and a "value"
+ * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth
+ * time and refcount. Together these are enough to track references to a
+ * specific block, to build a valid block pointer to reference that block (for
+ * freeing, scrubbing, etc), and to fill a new block pointer with the missing
+ * pieces to make it seem like it was written.
+ *
+ * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
+ * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
+ * object data formats, each with their own implementations) and "classes"
+ * (ddt_class_t, an instance of a storage type object, for entries with a specific
+ * characteristic). An entry (key) will only ever exist on one of these objects
+ * at any given time, but may be moved from one to another if their type or
+ * class changes.
+ *
+ * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
+ * is to be written, before DVAs have been allocated, ddt_lookup() is called to
+ * see if the block has been seen before. If its not found, the write proceeds
+ * as normal, and after it succeeds, a new entry is created. If it is found, we
+ * fill the BP with the DVAs from the entry, increment the refcount and cause
+ * the write IO to return immediately.
+ *
+ * Each ddt_phys_t slot in the entry represents a separate dedup block for the
+ * same content/checksum. The slot is selected based on the zp_copies parameter
+ * the block is written with, that is, the number of DVAs in the block. The
+ * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto"
+ * feature. These are no longer written, and will be freed if encountered on
+ * old pools.
+ *
+ * ## Lifetime of an entry
+ *
+ * A DDT can be enormous, and typically is not held in memory all at once.
+ * Instead, the changes to an entry are tracked in memory, and written down to
+ * disk at the end of each txg.
+ *
+ * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
+ * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
+ * required for IO, ddt_lookup() is called. If an entry already exists on
+ * ddt_tree, it is returned. Otherwise, a new one is created, and the
+ * type/class objects for the DDT are searched for that key. If it's found, its
+ * value is copied into the live entry. If not, an empty entry is created.
+ *
+ * The live entry will be modified during the txg, usually by modifying the
+ * refcount, but sometimes by adding or updating DVAs. At the end of the txg
+ * (during spa_sync()), type and class are recalculated for the entry (see
+ * ddt_sync_entry()), and the entry is written to the appropriate storage
+ * object and (if necessary) removed from an old one. ddt_tree is cleared and
+ * the next txg can start.
+ *
+ * ## Repair IO
+ *
+ * If a read on a dedup block fails, but there are other copies of the block in
+ * the other ddt_phys_t slots, reads will be issued for those instead
+ * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
+ * the caller, and a copy is stashed on the entry's dde_repair_abd.
+ *
+ * During the end-of-txg sync, any entries with a dde_repair_abd get a
+ * "rewrite" write issued for the original block pointer, with the data read
+ * from the alternate block. If the block is actually damaged, this will invoke
+ * the pool's "self-healing" mechanism, and repair the block.
+ *
+ * ## Scanning (scrub/resilver)
+ *
+ * If dedup is active, the scrub machinery will walk the dedup table first, and
+ * scrub all blocks with refcnt > 1. After that it will move on to the
+ * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
+ * In this way, heavily deduplicated blocks are only scrubbed once. See the
+ * commentary on dsl_scan_ddt() for more details.
+ *
+ * Walking the DDT is done via ddt_walk(). The current position is stored in a
+ * ddt_bookmark_t, which represents a stable position in the storage object.
+ * This bookmark is stored by the scan machinery, and must reference the same
+ * position on the object even if the object changes, the pool is exported, or
+ * OpenZFS is upgraded.
+ *
+ * ## Interaction with block cloning
+ *
+ * If block cloning and dedup are both enabled on a pool, BRT will look for the
+ * dedup bit on an incoming block pointer. If set, it will call into the DDT
+ * (ddt_addref()) to add a reference to the block, instead of adding a
+ * reference to the BRT. See brt_pending_apply().
+ */
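The "dictionary or map" framing in the comment above can be made concrete with a toy userspace model: a table keyed by checksum whose values hold a location and a refcount, making the same found/not-found decision the write pipeline makes. This is an illustration of the concept only, not OpenZFS code; the real ddt_key_t and ddt_phys_t carry far more state, and hash collisions are ignored here:

#include <stdint.h>
#include <stdio.h>

/* Toy dedup table: maps a checksum to a block location and a refcount. */
#define	TOY_DDT_SLOTS	256

typedef struct toy_entry {
	uint64_t	te_cksum;	/* stands in for ddt_key_t */
	uint64_t	te_location;	/* stands in for the DVAs */
	uint64_t	te_refcnt;	/* stands in for ddp_refcnt */
	int		te_used;
} toy_entry_t;

static toy_entry_t toy_ddt[TOY_DDT_SLOTS];
static uint64_t next_location = 1;

/* "Write" a block: if its checksum was seen before, just bump the refcount. */
static uint64_t
toy_write(uint64_t cksum)
{
	toy_entry_t *te = &toy_ddt[cksum % TOY_DDT_SLOTS];

	if (te->te_used && te->te_cksum == cksum) {
		te->te_refcnt++;		/* dedup hit: no new allocation */
		return (te->te_location);
	}
	te->te_used = 1;			/* dedup miss: allocate and record */
	te->te_cksum = cksum;
	te->te_location = next_location++;
	te->te_refcnt = 1;
	return (te->te_location);
}

int
main(void)
{
	uint64_t a = toy_write(0xdeadbeef);
	uint64_t b = toy_write(0xdeadbeef);	/* same content: same location */
	uint64_t c = toy_write(0xfeedface);	/* new content: new location */

	printf("a=%llu b=%llu c=%llu refcnt=%llu\n",
	    (unsigned long long)a, (unsigned long long)b,
	    (unsigned long long)c,
	    (unsigned long long)toy_ddt[0xdeadbeefULL % TOY_DDT_SLOTS].te_refcnt);
	return (0);
}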
+
+/*
+ * These are the only checksums valid for dedup. They must match the list
+ * from dedup_table in zfs_prop.c
+ */
+#define DDT_CHECKSUM_VALID(c) \
+ (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
+ c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
+ c == ZIO_CHECKSUM_BLAKE3)
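One plausible way such a macro gets used is as a guard before any dedup-specific handling of a block pointer; this is an illustration of intent, not a claim about where the module actually performs the check:

	/* Hypothetical guard: only checksums accepted by DDT_CHECKSUM_VALID() can dedup. */
	if (!DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)))
		return (B_FALSE);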
+
static kmem_cache_t *ddt_cache;
static kmem_cache_t *ddt_entry_cache;
@@ -46,18 +152,18 @@ static kmem_cache_t *ddt_entry_cache;
*/
int zfs_dedup_prefetch = 0;
-static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
&ddt_zap_ops,
};
-static const char *ddt_class_name[DDT_CLASSES] = {
+static const char *const ddt_class_name[DDT_CLASSES] = {
"ditto",
"duplicate",
"unique",
};
static void
-ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_tx_t *tx)
{
spa_t *spa = ddt->ddt_spa;
@@ -69,20 +175,20 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_object_name(ddt, type, class, name);
- ASSERT(*objectp == 0);
- VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
- ASSERT(*objectp != 0);
+ ASSERT3U(*objectp, ==, 0);
+ VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
+ ASSERT3U(*objectp, !=, 0);
- VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
- sizeof (uint64_t), 1, objectp, tx) == 0);
+ VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx));
- VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class], tx) == 0);
+ &ddt->ddt_histogram[type][class], tx));
}
static void
-ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_tx_t *tx)
{
spa_t *spa = ddt->ddt_spa;
@@ -93,19 +199,20 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_object_name(ddt, type, class, name);
- ASSERT(*objectp != 0);
+ ASSERT3U(*objectp, !=, 0);
ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
- VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
- VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
- VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
- VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
- bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+ VERIFY0(ddt_object_count(ddt, type, class, &count));
+ VERIFY0(count);
+ VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+ VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
+ VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
+ memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));
*objectp = 0;
}
static int
-ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
dmu_object_info_t doi;
@@ -145,7 +252,7 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
}
static void
-ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_tx_t *tx)
{
ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
@@ -155,75 +262,95 @@ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
ddt_object_name(ddt, type, class, name);
- VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class], tx) == 0);
+ &ddt->ddt_histogram[type][class], tx));
/*
* Cache DDT statistics; this is the only time they'll change.
*/
- VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
- VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
+ VERIFY0(ddt_object_info(ddt, type, class, &doi));
+ VERIFY0(ddt_object_count(ddt, type, class, &count));
ddo->ddo_count = count;
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
}
+static boolean_t
+ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
static int
-ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
ddt_entry_t *dde)
{
if (!ddt_object_exists(ddt, type, class))
return (SET_ERROR(ENOENT));
return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
- ddt->ddt_object[type][class], dde));
+ ddt->ddt_object[type][class], &dde->dde_key,
+ dde->dde_phys, sizeof (dde->dde_phys)));
+}
+
+static int
+ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
+ const ddt_key_t *ddk)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (SET_ERROR(ENOENT));
+
+ return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os,
+ ddt->ddt_object[type][class], ddk));
}
static void
-ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde)
+ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
+ const ddt_key_t *ddk)
{
if (!ddt_object_exists(ddt, type, class))
return;
ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
- ddt->ddt_object[type][class], dde);
+ ddt->ddt_object[type][class], ddk);
}
-int
-ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+static int
+ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
ddt_entry_t *dde, dmu_tx_t *tx)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, tx));
+ ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
+ sizeof (dde->dde_phys), tx));
}
static int
-ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde, dmu_tx_t *tx)
+ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
+ const ddt_key_t *ddk, dmu_tx_t *tx)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, tx));
+ ddt->ddt_object[type][class], ddk, tx));
}
int
-ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
uint64_t *walk, ddt_entry_t *dde)
{
ASSERT(ddt_object_exists(ddt, type, class));
return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
- ddt->ddt_object[type][class], dde, walk));
+ ddt->ddt_object[type][class], walk, &dde->dde_key,
+ dde->dde_phys, sizeof (dde->dde_phys)));
}
int
-ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
uint64_t *count)
{
ASSERT(ddt_object_exists(ddt, type, class));
@@ -233,7 +360,7 @@ ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
}
int
-ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_object_info_t *doi)
{
if (!ddt_object_exists(ddt, type, class))
@@ -243,14 +370,8 @@ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
doi));
}
-boolean_t
-ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
-{
- return (!!ddt->ddt_object[type][class]);
-}
-
void
-ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
char *name)
{
(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
@@ -261,7 +382,7 @@ ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
void
ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
{
- ASSERT(txg != 0);
+ ASSERT3U(txg, !=, 0);
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
bp->blk_dva[d] = ddp->ddp_dva[d];
@@ -312,17 +433,17 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
{
- ASSERT(ddp->ddp_phys_birth == 0);
+ ASSERT0(ddp->ddp_phys_birth);
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
ddp->ddp_dva[d] = bp->blk_dva[d];
- ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+ ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
}
void
ddt_phys_clear(ddt_phys_t *ddp)
{
- bzero(ddp, sizeof (*ddp));
+ memset(ddp, 0, sizeof (*ddp));
}
void
@@ -335,12 +456,12 @@ void
ddt_phys_decref(ddt_phys_t *ddp)
{
if (ddp) {
- ASSERT(ddp->ddp_refcnt > 0);
+ ASSERT3U(ddp->ddp_refcnt, >, 0);
ddp->ddp_refcnt--;
}
}
-void
+static void
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
{
blkptr_t blk;
@@ -364,7 +485,7 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
- BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
return (ddp);
}
return (NULL);
@@ -381,221 +502,10 @@ ddt_phys_total_refcnt(const ddt_entry_t *dde)
return (refcnt);
}
-static void
-ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
-{
- spa_t *spa = ddt->ddt_spa;
- ddt_phys_t *ddp = dde->dde_phys;
- ddt_key_t *ddk = &dde->dde_key;
- uint64_t lsize = DDK_GET_LSIZE(ddk);
- uint64_t psize = DDK_GET_PSIZE(ddk);
-
- bzero(dds, sizeof (*dds));
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- uint64_t dsize = 0;
- uint64_t refcnt = ddp->ddp_refcnt;
-
- if (ddp->ddp_phys_birth == 0)
- continue;
-
- for (int d = 0; d < DDE_GET_NDVAS(dde); d++)
- dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
-
- dds->dds_blocks += 1;
- dds->dds_lsize += lsize;
- dds->dds_psize += psize;
- dds->dds_dsize += dsize;
-
- dds->dds_ref_blocks += refcnt;
- dds->dds_ref_lsize += lsize * refcnt;
- dds->dds_ref_psize += psize * refcnt;
- dds->dds_ref_dsize += dsize * refcnt;
- }
-}
-
-void
-ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
-{
- const uint64_t *s = (const uint64_t *)src;
- uint64_t *d = (uint64_t *)dst;
- uint64_t *d_end = (uint64_t *)(dst + 1);
-
- ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
-
- for (int i = 0; i < d_end - d; i++)
- d[i] += (s[i] ^ neg) - neg;
-}
-
-static void
-ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
-{
- ddt_stat_t dds;
- ddt_histogram_t *ddh;
- int bucket;
-
- ddt_stat_generate(ddt, dde, &dds);
-
- bucket = highbit64(dds.dds_ref_blocks) - 1;
- ASSERT(bucket >= 0);
-
- ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
-
- ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
-}
-
-void
-ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
-{
- for (int h = 0; h < 64; h++)
- ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
-}
-
-void
-ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
-{
- bzero(dds, sizeof (*dds));
-
- for (int h = 0; h < 64; h++)
- ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
-}
-
-boolean_t
-ddt_histogram_empty(const ddt_histogram_t *ddh)
-{
- const uint64_t *s = (const uint64_t *)ddh;
- const uint64_t *s_end = (const uint64_t *)(ddh + 1);
-
- while (s < s_end)
- if (*s++ != 0)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-void
-ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
-{
- /* Sum the statistics we cached in ddt_object_sync(). */
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- ddt_object_t *ddo =
- &ddt->ddt_object_stats[type][class];
- ddo_total->ddo_count += ddo->ddo_count;
- ddo_total->ddo_dspace += ddo->ddo_dspace;
- ddo_total->ddo_mspace += ddo->ddo_mspace;
- }
- }
- }
-
- /* ... and compute the averages. */
- if (ddo_total->ddo_count != 0) {
- ddo_total->ddo_dspace /= ddo_total->ddo_count;
- ddo_total->ddo_mspace /= ddo_total->ddo_count;
- }
-}
-
-void
-ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
-{
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
- ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++) {
- ddt_histogram_add(ddh,
- &ddt->ddt_histogram_cache[type][class]);
- }
- }
- }
-}
-
-void
-ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
-{
- ddt_histogram_t *ddh_total;
-
- ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
- ddt_get_dedup_histogram(spa, ddh_total);
- ddt_histogram_stat(dds_total, ddh_total);
- kmem_free(ddh_total, sizeof (ddt_histogram_t));
-}
-
-uint64_t
-ddt_get_dedup_dspace(spa_t *spa)
-{
- ddt_stat_t dds_total;
-
- if (spa->spa_dedup_dspace != ~0ULL)
- return (spa->spa_dedup_dspace);
-
- bzero(&dds_total, sizeof (ddt_stat_t));
-
- /* Calculate and cache the stats */
- ddt_get_dedup_stats(spa, &dds_total);
- spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize;
- return (spa->spa_dedup_dspace);
-}
-
-uint64_t
-ddt_get_pool_dedup_ratio(spa_t *spa)
-{
- ddt_stat_t dds_total = { 0 };
-
- ddt_get_dedup_stats(spa, &dds_total);
- if (dds_total.dds_dsize == 0)
- return (100);
-
- return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
-}
-
-size_t
-ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
-{
- uchar_t *version = dst++;
- int cpfunc = ZIO_COMPRESS_ZLE;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- size_t c_len;
-
- ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
-
- c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
-
- if (c_len == s_len) {
- cpfunc = ZIO_COMPRESS_OFF;
- bcopy(src, dst, s_len);
- }
-
- *version = cpfunc;
- if (ZFS_HOST_BYTEORDER)
- *version |= DDT_COMPRESS_BYTEORDER_MASK;
-
- return (c_len + 1);
-}
-
-void
-ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
-{
- uchar_t version = *src++;
- int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
-
- if (ci->ci_decompress != NULL)
- (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
- else
- bcopy(src, dst, d_len);
-
- if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
- (ZFS_HOST_BYTEORDER != 0))
- byteswap_uint64_array(dst, d_len);
-}
-
ddt_t *
ddt_select(spa_t *spa, const blkptr_t *bp)
{
+ ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
}
@@ -633,7 +543,7 @@ ddt_alloc(const ddt_key_t *ddk)
ddt_entry_t *dde;
dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
- bzero(dde, sizeof (ddt_entry_t));
+ memset(dde, 0, sizeof (ddt_entry_t));
cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
dde->dde_key = *ddk;
@@ -644,10 +554,10 @@ ddt_alloc(const ddt_key_t *ddk)
static void
ddt_free(ddt_entry_t *dde)
{
- ASSERT(!dde->dde_loading);
+ ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
for (int p = 0; p < DDT_PHYS_TYPES; p++)
- ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
if (dde->dde_repair_abd != NULL)
abd_free(dde->dde_repair_abd);
@@ -668,36 +578,48 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
- ddt_entry_t *dde, dde_search;
- enum ddt_type type;
- enum ddt_class class;
+ ddt_key_t search;
+ ddt_entry_t *dde;
+ ddt_type_t type;
+ ddt_class_t class;
avl_index_t where;
int error;
ASSERT(MUTEX_HELD(&ddt->ddt_lock));
- ddt_key_fill(&dde_search.dde_key, bp);
+ ddt_key_fill(&search, bp);
- dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
- if (dde == NULL) {
- if (!add)
- return (NULL);
- dde = ddt_alloc(&dde_search.dde_key);
- avl_insert(&ddt->ddt_tree, dde, where);
- }
+ /* Find an existing live entry */
+ dde = avl_find(&ddt->ddt_tree, &search, &where);
+ if (dde != NULL) {
+ /* Found it. If it's already loaded, we can just return it. */
+ if (dde->dde_flags & DDE_FLAG_LOADED)
+ return (dde);
- while (dde->dde_loading)
- cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+ /* Someone else is loading it, wait for it. */
+ while (!(dde->dde_flags & DDE_FLAG_LOADED))
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
- if (dde->dde_loaded)
return (dde);
+ }
- dde->dde_loading = B_TRUE;
+ /* Not found. */
+ if (!add)
+ return (NULL);
+ /* Time to make a new entry. */
+ dde = ddt_alloc(&search);
+ avl_insert(&ddt->ddt_tree, dde, where);
+
+ /*
+ * ddt_tree is now stable, so unlock and let everyone else keep moving.
+ * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
+ * and go to sleep waiting for it above.
+ */
ddt_exit(ddt);
+ /* Search all store objects for the entry. */
error = ENOENT;
-
for (type = 0; type < DDT_TYPES; type++) {
for (class = 0; class < DDT_CLASSES; class++) {
error = ddt_object_lookup(ddt, type, class, dde);
@@ -712,17 +634,16 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
ddt_enter(ddt);
- ASSERT(dde->dde_loaded == B_FALSE);
- ASSERT(dde->dde_loading == B_TRUE);
+ ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));
dde->dde_type = type; /* will be DDT_TYPES if no entry found */
dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
- dde->dde_loaded = B_TRUE;
- dde->dde_loading = B_FALSE;
if (error == 0)
ddt_stat_update(ddt, dde, -1ULL);
+ /* Entry loaded, everyone can proceed now */
+ dde->dde_flags |= DDE_FLAG_LOADED;
cv_broadcast(&dde->dde_cv);
return (dde);
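
The rewritten ddt_lookup() above replaces the old dde_loaded/dde_loading pair with a single DDE_FLAG_LOADED bit guarded by the DDT lock plus a per-entry condvar. A minimal sketch of the waiter side of that pattern, using hypothetical names (entry_t, FLAG_LOADED) rather than the real DDT types:

	/* Sketch only: wait until the loader sets the flag and broadcasts. */
	typedef struct entry {
		uint32_t	flags;	/* FLAG_LOADED set once the slow load is done */
		kcondvar_t	cv;
	} entry_t;

	#define	FLAG_LOADED	(1 << 0)

	static entry_t *
	entry_wait_loaded(kmutex_t *lock, entry_t *e)
	{
		ASSERT(MUTEX_HELD(lock));
		while (!(e->flags & FLAG_LOADED))
			cv_wait(&e->cv, lock);	/* loader does cv_broadcast() */
		return (e);
	}
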
@@ -732,7 +653,7 @@ void
ddt_prefetch(spa_t *spa, const blkptr_t *bp)
{
ddt_t *ddt;
- ddt_entry_t dde;
+ ddt_key_t ddk;
if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
return;
@@ -743,17 +664,18 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp)
* Thus no locking is required as the DDT can't disappear on us.
*/
ddt = ddt_select(spa, bp);
- ddt_key_fill(&dde.dde_key, bp);
+ ddt_key_fill(&ddk, bp);
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
- ddt_object_prefetch(ddt, type, class, &dde);
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &ddk);
}
}
}
/*
- * Opaque struct used for ddt_key comparison
+ * Key comparison. Any struct wanting to make use of this function must have
+ * the key as the first element.
*/
#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t))
@@ -762,12 +684,10 @@ typedef struct ddt_key_cmp {
} ddt_key_cmp_t;
int
-ddt_entry_compare(const void *x1, const void *x2)
+ddt_key_compare(const void *x1, const void *x2)
{
- const ddt_entry_t *dde1 = x1;
- const ddt_entry_t *dde2 = x2;
- const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key;
- const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key;
+ const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1;
+ const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2;
int32_t cmp = 0;
for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
@@ -785,12 +705,12 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
ddt_t *ddt;
ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
- bzero(ddt, sizeof (ddt_t));
+ memset(ddt, 0, sizeof (ddt_t));
mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ avl_create(&ddt->ddt_tree, ddt_key_compare,
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
- avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
ddt->ddt_checksum = c;
ddt->ddt_spa = spa;
@@ -802,8 +722,8 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
static void
ddt_table_free(ddt_t *ddt)
{
- ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
- ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ ASSERT0(avl_numnodes(&ddt->ddt_tree));
+ ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
avl_destroy(&ddt->ddt_tree);
avl_destroy(&ddt->ddt_repair_tree);
mutex_destroy(&ddt->ddt_lock);
@@ -815,8 +735,10 @@ ddt_create(spa_t *spa)
{
spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
- spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (DDT_CHECKSUM_VALID(c))
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+ }
}
int
@@ -834,9 +756,12 @@ ddt_load(spa_t *spa)
return (error == ENOENT ? 0 : error);
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (!DDT_CHECKSUM_VALID(c))
+ continue;
+
ddt_t *ddt = spa->spa_ddt[c];
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES;
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) {
error = ddt_object_load(ddt, type, class);
if (error != 0 && error != ENOENT)
@@ -847,7 +772,7 @@ ddt_load(spa_t *spa)
/*
* Seed the cached histograms.
*/
- bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL;
}
@@ -867,10 +792,10 @@ ddt_unload(spa_t *spa)
}
boolean_t
-ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
{
ddt_t *ddt;
- ddt_entry_t *dde;
+ ddt_key_t ddk;
if (!BP_GET_DEDUP(bp))
return (B_FALSE);
@@ -879,20 +804,16 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
return (B_TRUE);
ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
- dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
- ddt_key_fill(&(dde->dde_key), bp);
+ ddt_key_fill(&ddk, bp);
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class <= max_class; class++) {
- if (ddt_object_lookup(ddt, type, class, dde) == 0) {
- kmem_cache_free(ddt_entry_cache, dde);
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class <= max_class; class++) {
+ if (ddt_object_contains(ddt, type, class, &ddk) == 0)
return (B_TRUE);
- }
}
}
- kmem_cache_free(ddt_entry_cache, dde);
return (B_FALSE);
}
@@ -906,8 +827,8 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
dde = ddt_alloc(&ddk);
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
/*
* We can only do repair if there are multiple copies
* of the block. For anything in the UNIQUE class,
@@ -919,7 +840,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
}
}
- bzero(dde->dde_phys, sizeof (dde->dde_phys));
+ memset(dde->dde_phys, 0, sizeof (dde->dde_phys));
return (dde);
}
@@ -964,7 +885,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
- bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
@@ -1006,19 +927,18 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
- enum ddt_type otype = dde->dde_type;
- enum ddt_type ntype = DDT_TYPE_CURRENT;
- enum ddt_class oclass = dde->dde_class;
- enum ddt_class nclass;
+ ddt_type_t otype = dde->dde_type;
+ ddt_type_t ntype = DDT_TYPE_DEFAULT;
+ ddt_class_t oclass = dde->dde_class;
+ ddt_class_t nclass;
uint64_t total_refcnt = 0;
- ASSERT(dde->dde_loaded);
- ASSERT(!dde->dde_loading);
+ ASSERT(dde->dde_flags & DDE_FLAG_LOADED);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
if (ddp->ddp_phys_birth == 0) {
- ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT0(ddp->ddp_refcnt);
continue;
}
if (p == DDT_PHYS_DITTO) {
@@ -1043,8 +963,9 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
if (otype != DDT_TYPES &&
(otype != ntype || oclass != nclass || total_refcnt == 0)) {
- VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
- ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
+ ASSERT3U(
+ ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT);
}
if (total_refcnt != 0) {
@@ -1053,7 +974,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
ddt_stat_update(ddt, dde, 0);
if (!ddt_object_exists(ddt, ntype, nclass))
ddt_object_create(ddt, ntype, nclass, tx);
- VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+ VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx));
/*
* If the class changes, the order that we scan this bp
@@ -1079,7 +1000,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
if (avl_numnodes(&ddt->ddt_tree) == 0)
return;
- ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+ ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP);
if (spa->spa_ddt_stat_object == 0) {
spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
@@ -1092,23 +1013,23 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
ddt_free(dde);
}
- for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
uint64_t add, count = 0;
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
if (ddt_object_exists(ddt, type, class)) {
ddt_object_sync(ddt, type, class, tx);
- VERIFY(ddt_object_count(ddt, type, class,
- &add) == 0);
+ VERIFY0(ddt_object_count(ddt, type, class,
+ &add));
count += add;
}
}
- for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
if (count == 0 && ddt_object_exists(ddt, type, class))
ddt_object_destroy(ddt, type, class, tx);
}
}
- bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
sizeof (ddt->ddt_histogram));
spa->spa_dedup_dspace = ~0ULL;
}
@@ -1120,7 +1041,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
dmu_tx_t *tx;
zio_t *rio;
- ASSERT(spa_syncing_txg(spa) == txg);
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1157,6 +1078,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
do {
do {
ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ if (ddt == NULL)
+ continue;
int error = ENOENT;
if (ddt_object_exists(ddt, ddb->ddb_type,
ddb->ddb_class)) {
@@ -1180,7 +1103,68 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
return (SET_ERROR(ENOENT));
}
-/* BEGIN CSTYLED */
+/*
+ * This function is used by Block Cloning (brt.c) to increase the reference
+ * counter for the DDT entry if the block is already in the DDT.
+ *
+ * Return false if the block, despite having the D bit set, is not present
+ * in the DDT. Currently this is not possible, but it might be in the future.
+ * See the comment below.
+ */
+boolean_t
+ddt_addref(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+ boolean_t result;
+
+ spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+ ddt = ddt_select(spa, bp);
+ ddt_enter(ddt);
+
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ASSERT3P(dde, !=, NULL);
+
+ if (dde->dde_type < DDT_TYPES) {
+ ddt_phys_t *ddp;
+
+ ASSERT3S(dde->dde_class, <, DDT_CLASSES);
+
+ ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
+
+ /*
+ * This entry already existed (dde_type is real), so it must
+ * have refcnt >0 at the start of this txg. We are called from
+ * brt_pending_apply(), before frees are issued, so the refcnt
+ * can't be lowered yet. Therefore, it must be >0. We assert
+ * this because if the order of BRT and DDT interactions were
+ * ever to change and the refcnt was ever zero here, then
+ * likely further action is required to fill out the DDT entry,
+ * and this is a place that is likely to be missed in testing.
+ */
+ ASSERT3U(ddp->ddp_refcnt, >, 0);
+
+ ddt_phys_addref(ddp);
+ result = B_TRUE;
+ } else {
+ /*
+ * At the time of implementing this, if the block has the DEDUP
+ * flag set it must exist in the DDT, but there are many advocates
+ * who want the ability to remove entries with refcnt=1 from the
+ * DDT. If that ever happens, we may have a block with the DEDUP
+ * flag set but no corresponding entry in the DDT. Be ready.
+ */
+ ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
+ ddt_remove(ddt, dde);
+ result = B_FALSE;
+ }
+
+ ddt_exit(ddt);
+ spa_config_exit(spa, SCL_ZIO, FTAG);
+
+ return (result);
+}
+
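
A rough sketch of how a block-cloning caller might use ddt_addref(); only the BP_GET_DEDUP() check and the ddt_addref(spa, bp) call come from this patch, the wrapper and the track_clone_in_brt() fallback are hypothetical:

	/* Illustrative only: bump whichever table tracks references for this block. */
	static void
	clone_block_addref(spa_t *spa, const blkptr_t *bp)
	{
		if (BP_GET_DEDUP(bp) && ddt_addref(spa, bp)) {
			/* The DDT entry now carries the extra reference. */
			return;
		}
		/* Dedup bit clear, or entry missing from the DDT: use the BRT instead. */
		track_clone_in_brt(spa, bp);	/* hypothetical fallback */
	}
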
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
"Enable prefetching dedup-ed blks");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/ddt_stats.c b/sys/contrib/openzfs/module/zfs/ddt_stats.c
new file mode 100644
index 000000000000..af5365a1d114
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/ddt_stats.c
@@ -0,0 +1,212 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/ddt.h>
+#include <sys/ddt_impl.h>
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ memset(dds, 0, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ int ndvas = DDK_GET_CRYPT(&dde->dde_key) ?
+ SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;
+ for (int d = 0; d < ndvas; d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
+ for (int i = 0; i < d_end - d; i++)
+ d[i] += (s[i] ^ neg) - neg;
+}
+
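
ddt_stat_add() above folds addition and subtraction into one loop: with neg == 0 the expression (s[i] ^ neg) - neg is simply s[i], while with neg == -1ULL it becomes ~s[i] + 1, the two's-complement negation of s[i]. A tiny standalone check of that identity (illustrative, not part of the module):

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t d = 100, s = 30, neg;

		neg = 0;		/* add: d += s */
		d += (s ^ neg) - neg;
		assert(d == 130);

		neg = -1ULL;		/* subtract: d += -s (mod 2^64) */
		d += (s ^ neg) - neg;
		assert(d == 100);

		return (0);
	}
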
+void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
+ bucket = highbit64(dds.dds_ref_blocks) - 1;
+ ASSERT3U(bucket, >=, 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ memset(dds, 0, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (!ddt)
+ continue;
+
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (!ddt)
+ continue;
+
+ for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
+ for (ddt_class_t class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+ ddt_stat_t dds_total;
+
+ if (spa->spa_dedup_dspace != ~0ULL)
+ return (spa->spa_dedup_dspace);
+
+ memset(&dds_total, 0, sizeof (ddt_stat_t));
+
+ /* Calculate and cache the stats */
+ ddt_get_dedup_stats(spa, &dds_total);
+ spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize;
+ return (spa->spa_dedup_dspace);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
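
ddt_get_pool_dedup_ratio() above reports the ratio as a percentage: referenced (logical, pre-dedup) allocated space over space actually stored. A worked example with made-up numbers:

	uint64_t dds_ref_dsize = 300ULL << 30;	/* 300 GiB referenced */
	uint64_t dds_dsize = 120ULL << 30;	/* 120 GiB actually stored */
	uint64_t ratio = dds_ref_dsize * 100 / dds_dsize;	/* 250, i.e. a 2.50x dedup ratio */
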
diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c
index c5c9eda0b2d0..741554de3c60 100644
--- a/sys/contrib/openzfs/module/zfs/ddt_zap.c
+++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -28,11 +28,60 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/ddt.h>
+#include <sys/ddt_impl.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
+#include <sys/zio_compress.h>
-int ddt_zap_leaf_blockshift = 12;
-int ddt_zap_indirect_blockshift = 12;
+static unsigned int ddt_zap_default_bs = 15;
+static unsigned int ddt_zap_default_ibs = 15;
+
+#define DDT_ZAP_COMPRESS_BYTEORDER_MASK 0x80
+#define DDT_ZAP_COMPRESS_FUNCTION_MASK 0x7f
+
+#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))
+
+static size_t
+ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1,
+ ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ memcpy(dst, src, s_len);
+ }
+
+ *version = cpfunc;
+ if (ZFS_HOST_BYTEORDER)
+ *version |= DDT_ZAP_COMPRESS_BYTEORDER_MASK;
+
+ return (c_len + 1);
+}
+
+static void
+ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ memcpy(dst, src, d_len);
+
+ if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
+ (ZFS_HOST_BYTEORDER != 0))
+ byteswap_uint64_array(dst, d_len);
+}
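
The helpers above prepend a one-byte header to each stored DDT leaf: the low seven bits carry the zio_compress function used (ZLE, or OFF when ZLE failed to shrink the buffer) and the 0x80 bit records the writer's byte order. A small sketch of splitting that header, using only the masks defined above (the helper name is illustrative):

	static inline void
	ddt_zap_header_decode(uchar_t version, int *cpfunc, boolean_t *swap)
	{
		*cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;	/* low 7 bits */
		/* Byteswap only when the on-disk order differs from the host's. */
		*swap = (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) !=
		    (ZFS_HOST_BYTEORDER != 0));
	}
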
static int
ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
@@ -43,10 +92,12 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
flags |= ZAP_FLAG_PRE_HASHED_KEY;
*objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
- ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ ddt_zap_default_bs, ddt_zap_default_ibs,
DMU_OT_NONE, 0, tx);
+ if (*objectp == 0)
+ return (SET_ERROR(ENOTSUP));
- return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0);
+ return (0);
}
static int
@@ -56,63 +107,75 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
}
static int
-ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+ddt_zap_lookup(objset_t *os, uint64_t object,
+ const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize)
{
uchar_t *cbuf;
uint64_t one, csize;
int error;
- cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_SLEEP);
-
- error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ error = zap_length_uint64(os, object, (uint64_t *)ddk,
DDT_KEY_WORDS, &one, &csize);
if (error)
- goto out;
+ return (error);
- ASSERT(one == 1);
- ASSERT(csize <= (sizeof (dde->dde_phys) + 1));
+ ASSERT3U(one, ==, 1);
+ ASSERT3U(csize, <=, psize + 1);
- error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ cbuf = kmem_alloc(csize, KM_SLEEP);
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)ddk,
DDT_KEY_WORDS, 1, csize, cbuf);
- if (error)
- goto out;
+ if (error == 0)
+ ddt_zap_decompress(cbuf, phys, csize, psize);
- ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
-out:
- kmem_free(cbuf, sizeof (dde->dde_phys) + 1);
+ kmem_free(cbuf, csize);
return (error);
}
+static int
+ddt_zap_contains(objset_t *os, uint64_t object, const ddt_key_t *ddk)
+{
+ return (zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS,
+ NULL, NULL));
+}
+
static void
-ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk)
{
- (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
- DDT_KEY_WORDS);
+ (void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS);
}
static int
-ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
+ const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx)
{
- uchar_t cbuf[sizeof (dde->dde_phys) + 1];
- uint64_t csize;
+ const size_t cbuf_size = psize + 1;
+
+ uchar_t *cbuf = kmem_alloc(cbuf_size, KM_SLEEP);
+
+ uint64_t csize = ddt_zap_compress(phys, cbuf, psize, cbuf_size);
- csize = ddt_compress(dde->dde_phys, cbuf,
- sizeof (dde->dde_phys), sizeof (cbuf));
+ int error = zap_update_uint64(os, object, (uint64_t *)ddk,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx);
- return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
- DDT_KEY_WORDS, 1, csize, cbuf, tx));
+ kmem_free(cbuf, cbuf_size);
+
+ return (error);
}
static int
-ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
+ dmu_tx_t *tx)
{
- return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ return (zap_remove_uint64(os, object, (uint64_t *)ddk,
DDT_KEY_WORDS, tx));
}
static int
-ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
+ ddt_phys_t *phys, size_t psize)
{
zap_cursor_t zc;
zap_attribute_t za;
@@ -131,17 +194,23 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
zap_cursor_init_serialized(&zc, os, object, *walk);
}
if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
- uchar_t cbuf[sizeof (dde->dde_phys) + 1];
uint64_t csize = za.za_num_integers;
- ASSERT(za.za_integer_length == 1);
+
+ ASSERT3U(za.za_integer_length, ==, 1);
+ ASSERT3U(csize, <=, psize + 1);
+
+ uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP);
+
error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
DDT_KEY_WORDS, 1, csize, cbuf);
- ASSERT(error == 0);
+ ASSERT0(error);
if (error == 0) {
- ddt_decompress(cbuf, dde->dde_phys, csize,
- sizeof (dde->dde_phys));
- dde->dde_key = *(ddt_key_t *)za.za_name;
+ ddt_zap_decompress(cbuf, phys, csize, psize);
+ *ddk = *(ddt_key_t *)za.za_name;
}
+
+ kmem_free(cbuf, csize);
+
zap_cursor_advance(&zc);
*walk = zap_cursor_serialize(&zc);
}
@@ -160,9 +229,17 @@ const ddt_ops_t ddt_zap_ops = {
ddt_zap_create,
ddt_zap_destroy,
ddt_zap_lookup,
+ ddt_zap_contains,
ddt_zap_prefetch,
ddt_zap_update,
ddt_zap_remove,
ddt_zap_walk,
ddt_zap_count,
};
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW,
+ "DDT ZAP leaf blockshift");
+ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW,
+ "DDT ZAP indirect blockshift");
+/* END CSTYLED */
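
For scale: the new default shift of 15 gives 32 KiB (1 << 15) leaf and indirect blocks for the DDT ZAP objects, compared with the 4 KiB (1 << 12) blocks implied by the old ddt_zap_leaf_blockshift/ddt_zap_indirect_blockshift values removed above.
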
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
index eee3e70bbc95..8b440aafba43 100644
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -28,6 +28,8 @@
* Copyright (c) 2019 Datto Inc.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/dmu.h>
@@ -51,6 +53,7 @@
#include <sys/sa.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
+#include <sys/brt.h>
#include <sys/trace_zfs.h>
#include <sys/zfs_racct.h>
#include <sys/zfs_rlock.h>
@@ -62,7 +65,7 @@
/*
* Enable/disable nopwrite feature.
*/
-int zfs_nopwrite_enabled = 1;
+static int zfs_nopwrite_enabled = 1;
/*
* Tunable to control percentage of dirtied L1 blocks from frees allowed into
@@ -70,19 +73,27 @@ int zfs_nopwrite_enabled = 1;
* will wait until the next TXG.
* A value of zero will disable this throttle.
*/
-unsigned long zfs_per_txg_dirty_frees_percent = 5;
+static uint_t zfs_per_txg_dirty_frees_percent = 30;
/*
- * Enable/disable forcing txg sync when dirty in dmu_offset_next.
+ * Enable/disable forcing txg sync when dirty checking for holes with lseek().
+ * By default this is enabled to ensure accurate hole reporting, it can result
+ * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads.
+ * Disabling this option will result in holes never being reported in dirty
+ * files which is always safe.
*/
-int zfs_dmu_offset_next_sync = 0;
+static int zfs_dmu_offset_next_sync = 1;
/*
* Limit the amount we can prefetch with one call to this amount. This
* helps to limit the amount of memory that can be used by prefetching.
* Larger objects should be prefetched a bit at a time.
*/
-int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+#ifdef _ILP32
+uint_t dmu_prefetch_max = 8 * 1024 * 1024;
+#else
+uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+#endif
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
@@ -141,7 +152,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" }
};
-const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ byteswap_uint8_array, "uint8" },
{ byteswap_uint16_array, "uint16" },
{ byteswap_uint32_array, "uint32" },
@@ -154,9 +165,9 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ zfs_acl_byteswap, "acl" }
};
-static int
+int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
+ const void *tag, dmu_buf_t **dbp)
{
uint64_t blkid;
dmu_buf_impl_t *db;
@@ -174,9 +185,10 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
*dbp = &db->db;
return (0);
}
+
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp)
+ const void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
uint64_t blkid;
@@ -203,7 +215,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
int
dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags)
+ const void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
@@ -228,7 +240,7 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags)
+ const void *tag, dmu_buf_t **dbp, int flags)
{
int err;
int db_flags = DB_RF_CANFAIL;
@@ -338,7 +350,7 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
* has not yet been allocated a new bonus dbuf a will be allocated.
* Returns ENOENT, EIO, or 0.
*/
-int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
+int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
uint32_t flags)
{
dmu_buf_impl_t *db;
@@ -352,8 +364,10 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
- rw_exit(&dn->dn_struct_rwlock);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ }
if (dn->dn_bonus == NULL)
dbuf_create_bonus(dn);
}
@@ -385,7 +399,7 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
}
int
-dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
int error;
@@ -410,7 +424,8 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
* dmu_spill_hold_existing() should be used.
*/
int
-dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag,
+ dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = NULL;
int err;
@@ -438,7 +453,7 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
}
int
-dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
dnode_t *dn;
@@ -467,7 +482,7 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
}
int
-dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag,
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
dmu_buf_t **dbp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
@@ -494,7 +509,8 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag,
*/
int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
- boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+ boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
+ uint32_t flags)
{
dmu_buf_t **dbp;
zstream_t *zs = NULL;
@@ -504,7 +520,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zio_t *zio = NULL;
boolean_t missed = B_FALSE;
- ASSERT(length <= DMU_MAX_ACCESS);
+ ASSERT(!read || length <= DMU_MAX_ACCESS);
/*
* Note: We directly notify the prefetch code of this read, so that
@@ -514,11 +530,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
DB_RF_NOPREFETCH;
+ if ((flags & DMU_READ_NO_DECRYPT) != 0)
+ dbuf_flags |= DB_RF_NO_DECRYPT;
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
- P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
+ P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t))
+ >> blkshift;
} else {
if (offset + length > dn->dn_datablksz) {
zfs_panic_recover("zfs: accessing past end of object "
@@ -538,21 +558,22 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
- if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
- DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+ if ((flags & DMU_READ_NO_PREFETCH) == 0) {
/*
* Prepare the zfetch before initiating the demand reads, so
* that if multiple threads block on same indirect block, we
* base predictions on the original less racy request order.
*/
- zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
- read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+ zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
+ B_TRUE);
}
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
- if (zs)
- dmu_zfetch_run(zs, missed, B_TRUE);
+ if (zs) {
+ dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
+ B_TRUE);
+ }
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
if (read)
@@ -569,6 +590,14 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
* state will not yet be CACHED.
*/
if (read) {
+ if (i == nblks - 1 && blkid + i < dn->dn_maxblkid &&
+ offset + length < db->db.db_offset +
+ db->db.db_size) {
+ if (offset <= db->db.db_offset)
+ dbuf_flags |= DB_RF_PARTIAL_FIRST;
+ else
+ dbuf_flags |= DB_RF_PARTIAL_MORE;
+ }
(void) dbuf_read(db, zio, dbuf_flags);
if (db->db_state != DB_CACHED)
missed = B_TRUE;
@@ -580,7 +609,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zfs_racct_write(length, nblks);
if (zs)
- dmu_zfetch_run(zs, missed, B_TRUE);
+ dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);
if (read) {
@@ -615,7 +644,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+ uint64_t length, int read, const void *tag, int *numbufsp,
+ dmu_buf_t ***dbpp)
{
dnode_t *dn;
int err;
@@ -634,7 +664,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
- uint64_t length, boolean_t read, void *tag, int *numbufsp,
+ uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
@@ -651,7 +681,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
}
void
-dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
{
int i;
dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
@@ -668,72 +698,99 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
}
/*
- * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * Issue prefetch I/Os for the given blocks. If level is greater than 0, the
* indirect blocks prefetched will be those that point to the blocks containing
- * the data starting at offset, and continuing to offset + len.
+ * the data starting at offset, and continuing to offset + len. If the range
+ * is too long, prefetch the first dmu_prefetch_max bytes as requested, while
+ * for the rest only a higher level, also fitting within dmu_prefetch_max. It
+ * should primarily help random reads, since for long sequential reads there is
+ * a speculative prefetcher.
*
* Note that if the indirect blocks above the blocks being prefetched are not
- * in cache, they will be asynchronously read in.
+ * in cache, they will be asynchronously read in. The dnode read by
+ * dnode_hold() is currently synchronous.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
- uint64_t blkid;
- int nblks, err;
-
- if (len == 0) { /* they're interested in the bonus buffer */
- dn = DMU_META_DNODE(os);
- if (object == 0 || object >= DN_MAX_OBJECT)
- return;
-
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, level,
- object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, level, blkid, pri, 0);
- rw_exit(&dn->dn_struct_rwlock);
+ if (dmu_prefetch_max == 0 || len == 0) {
+ dmu_prefetch_dnode(os, object, pri);
return;
}
- /*
- * See comment before the definition of dmu_prefetch_max.
- */
- len = MIN(len, dmu_prefetch_max);
-
- /*
- * XXX - Note, if the dnode for the requested object is not
- * already cached, we will do a *synchronous* read in the
- * dnode_hold() call. The same is true for any indirects.
- */
- err = dnode_hold(os, object, FTAG, &dn);
- if (err != 0)
+ if (dnode_hold(os, object, FTAG, &dn) != 0)
return;
+ dmu_prefetch_by_dnode(dn, level, offset, len, pri);
+
+ dnode_rele(dn, FTAG);
+}
+
+void
+dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
+{
+ int64_t level2 = level;
+ uint64_t start, end, start2, end2;
+
/*
- * offset + len - 1 is the last byte we want to prefetch for, and offset
- * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
- * last block we want to prefetch, and dbuf_whichblock(dn, level,
- * offset) is the first. Then the number we need to prefetch is the
- * last - first + 1.
+ * Depending on len we may do two prefetches: blocks [start, end) at
+ * level, and following blocks [start2, end2) at higher level2.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (level > 0 || dn->dn_datablkshift != 0) {
- nblks = dbuf_whichblock(dn, level, offset + len - 1) -
- dbuf_whichblock(dn, level, offset) + 1;
+ if (dn->dn_datablkshift != 0) {
+ /*
+ * The object has multiple blocks. Calculate the full range
+ * of blocks [start, end2) and then split it into two parts,
+ * so that the first [start, end) fits into dmu_prefetch_max.
+ */
+ start = dbuf_whichblock(dn, level, offset);
+ end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
+ uint8_t ibs = dn->dn_indblkshift;
+ uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
+ uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
+ start2 = end = MIN(end2, start + limit);
+
+ /*
+ * Find level2 where [start2, end2) fits into dmu_prefetch_max.
+ */
+ uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
+ limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
+ do {
+ level2++;
+ start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
+ end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
+ } while (end2 - start2 > limit);
} else {
- nblks = (offset < dn->dn_datablksz);
+ /* There is only one block. Prefetch it or nothing. */
+ start = start2 = end2 = 0;
+ end = start + (level == 0 && offset < dn->dn_datablksz);
}
- if (nblks != 0) {
- blkid = dbuf_whichblock(dn, level, offset);
- for (int i = 0; i < nblks; i++)
- dbuf_prefetch(dn, level, blkid + i, pri, 0);
- }
+ for (uint64_t i = start; i < end; i++)
+ dbuf_prefetch(dn, level, i, pri, 0);
+ for (uint64_t i = start2; i < end2; i++)
+ dbuf_prefetch(dn, level2, i, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
+}
- dnode_rele(dn, FTAG);
+/*
+ * Issue prefetch I/Os for the given object's dnode.
+ */
+void
+dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
+{
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ dnode_t *dn = DMU_META_DNODE(os);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, 0, blkid, pri, 0);
+ rw_exit(&dn->dn_struct_rwlock);
}
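
To illustrate the two-level split in dmu_prefetch_by_dnode() with typical values (assumed here, not taken from the patch): for 128 KiB data blocks (dn_datablkshift = 17), 128 KiB indirects (dn_indblkshift = 17) and the 64-bit default dmu_prefetch_max of 8 * SPA_MAXBLOCKSIZE = 128 MiB, the level-0 limit is 128 MiB / 128 KiB = 1024 blocks. A 1 GiB (8192-block) request therefore prefetches its first 1024 L0 blocks directly; the remainder is shifted up by ibps = 17 - SPA_BLKPTRSHIFT = 10, collapsing to seven L1 indirect blocks (each covering 1024 L0 blocks), well under the same limit, so the do/while loop stops at level 1.
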
/*
@@ -798,7 +855,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
}
/* set start to the beginning of this L1 indirect */
- *start = P2ALIGN(*start, iblkrange);
+ *start = P2ALIGN_TYPED(*start, iblkrange, uint64_t);
}
if (*start < minimum)
*start = minimum;
@@ -812,13 +869,14 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
* otherwise return false.
* Used below in dmu_free_long_range_impl() to enable abort when unmounting
*/
-/*ARGSUSED*/
static boolean_t
dmu_objset_zfs_unmounting(objset_t *os)
{
#ifdef _KERNEL
if (dmu_objset_type(os) == DMU_OST_ZFS)
return (zfs_get_vfs_flag_unmounted(os));
+#else
+ (void) os;
#endif
return (B_FALSE);
}
@@ -1007,7 +1065,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
if (dn->dn_maxblkid == 0) {
uint64_t newsz = offset > dn->dn_datablksz ? 0 :
MIN(size, dn->dn_datablksz - offset);
- bzero((char *)buf + newsz, size - newsz);
+ memset((char *)buf + newsz, 0, size - newsz);
size = newsz;
}
@@ -1087,14 +1145,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
+ dmu_buf_will_fill(db, tx, B_FALSE);
else
dmu_buf_will_dirty(db, tx);
(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
+ dmu_buf_fill_done(db, tx, B_FALSE);
offset += tocpy;
size -= tocpy;
@@ -1302,27 +1360,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
ASSERT(size > 0);
- bufoff = zfs_uio_offset(uio) - db->db_offset;
+ offset_t off = zfs_uio_offset(uio);
+ bufoff = off - db->db_offset;
tocpy = MIN(db->db_size - bufoff, size);
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
if (tocpy == db->db_size)
- dmu_buf_will_fill(db, tx);
+ dmu_buf_will_fill(db, tx, B_TRUE);
else
dmu_buf_will_dirty(db, tx);
- /*
- * XXX zfs_uiomove could block forever (eg.nfs-backed
- * pages). There needs to be a uiolockdown() function
- * to lock the pages in memory, so that zfs_uiomove won't
- * block.
- */
err = zfs_uio_fault_move((char *)db->db_data + bufoff,
tocpy, UIO_WRITE, uio);
- if (tocpy == db->db_size)
- dmu_buf_fill_done(db, tx);
+ if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
+ /* The fill was reverted. Undo any uio progress. */
+ zfs_uio_advance(uio, off - zfs_uio_offset(uio));
+ }
if (err)
break;
@@ -1424,7 +1479,7 @@ dmu_return_arcbuf(arc_buf_t *buf)
*/
int
dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
- const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx)
+ const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx)
{
dbuf_dirty_record_t *dr =
dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx);
@@ -1454,9 +1509,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, 0, offset);
db = dbuf_hold(dn, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
if (db == NULL)
return (SET_ERROR(EIO));
- rw_exit(&dn->dn_struct_rwlock);
/*
* We can only assign if the offset is aligned and the arc buf is the
@@ -1500,10 +1555,10 @@ typedef struct {
dmu_tx_t *dsa_tx;
} dmu_sync_arg_t;
-/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
+ (void) buf;
dmu_sync_arg_t *dsa = varg;
dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
blkptr_t *bp = zio->io_bp;
@@ -1528,10 +1583,10 @@ dmu_sync_late_arrival_ready(zio_t *zio)
dmu_sync_ready(zio, NULL, zio->io_private);
}
-/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
+ (void) buf;
dmu_sync_arg_t *dsa = varg;
dbuf_dirty_record_t *dr = dsa->dsa_dr;
dmu_buf_impl_t *db = dr->dr_dbuf;
@@ -1575,7 +1630,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
* it's an old style hole.
*/
if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
- dr->dt.dl.dr_overridden_by.blk_birth == 0)
+ BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0)
BP_ZERO(&dr->dt.dl.dr_overridden_by);
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1606,7 +1661,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
- ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
}
@@ -1626,10 +1681,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
{
dmu_sync_arg_t *dsa;
dmu_tx_t *tx;
+ int error;
+
+ error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
+ DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ if (error != 0)
+ return (error);
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
- if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ /*
+ * This transaction does not produce any dirty data or log blocks, so
+ * it should not be throttled. All other cases wait for TXG sync, by
+ * which time the log block we are writing will be obsolete, so we can
+ * skip waiting and just return error here instead.
+ */
+ if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) {
dmu_tx_abort(tx);
/* Make zl_get_data do txg_waited_synced() */
return (SET_ERROR(EIO));
@@ -1674,7 +1741,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
- dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+ dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
@@ -1838,9 +1905,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
dsa->dsa_zgd = zgd;
dsa->dsa_tx = NULL;
- zio_nowait(arc_write(pio, os->os_spa, txg,
- zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db),
- &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
+ zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
+ dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
+ &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
@@ -1935,7 +2002,7 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
* When the "redundant_metadata" property is set to "most", only indirect
* blocks of this level and higher will have an additional ditto block.
*/
-int zfs_redundant_metadata_most_ditto_level = 2;
+static const int zfs_redundant_metadata_most_ditto_level = 2;
void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
@@ -1981,12 +2048,22 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
ZCHECKSUM_FLAG_EMBEDDED))
checksum = ZIO_CHECKSUM_FLETCHER_4;
- if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
- (os->os_redundant_metadata ==
- ZFS_REDUNDANT_METADATA_MOST &&
- (level >= zfs_redundant_metadata_most_ditto_level ||
- DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+ switch (os->os_redundant_metadata) {
+ case ZFS_REDUNDANT_METADATA_ALL:
copies++;
+ break;
+ case ZFS_REDUNDANT_METADATA_MOST:
+ if (level >= zfs_redundant_metadata_most_ditto_level ||
+ DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
+ copies++;
+ break;
+ case ZFS_REDUNDANT_METADATA_SOME:
+ if (DMU_OT_IS_CRITICAL(type))
+ copies++;
+ break;
+ case ZFS_REDUNDANT_METADATA_NONE:
+ break;
+ }
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
@@ -2072,9 +2149,9 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_nopwrite = nopwrite;
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
- bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
- bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
- bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
+ memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
+ memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
+ memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
os->os_zpl_special_smallblock : 0;
@@ -2082,18 +2159,18 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
}
/*
- * This function is only called from zfs_holey_common() for zpl_llseek()
- * in order to determine the location of holes. In order to accurately
- * report holes all dirty data must be synced to disk. This causes extremely
- * poor performance when seeking for holes in a dirty file. As a compromise,
- * only provide hole data when the dnode is clean. When a dnode is dirty
- * report the dnode as having no holes which is always a safe thing to do.
+ * Reports the location of data and holes in an object. In order to
+ * accurately report holes all dirty data must be synced to disk. This
+ * causes extremely poor performance when seeking for holes in a dirty file.
+ * As a compromise, only provide hole data when the dnode is clean. When
+ * a dnode is dirty report the dnode as having no holes by returning EBUSY
+ * which is always safe to do.
*/
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
- int err;
+ int restarted = 0, err;
restart:
err = dnode_hold(os, object, FTAG, &dn);
@@ -2105,19 +2182,23 @@ restart:
if (dnode_is_dirty(dn)) {
/*
* If the zfs_dmu_offset_next_sync module option is enabled
- * then strict hole reporting has been requested. Dirty
- * dnodes must be synced to disk to accurately report all
- * holes. When disabled (the default) dirty dnodes are
- * reported to not have any holes which is always safe.
+ * then hole reporting has been requested. Dirty dnodes
+ * must be synced to disk to accurately report holes.
*
- * When called by zfs_holey_common() the zp->z_rangelock
- * is held to prevent zfs_write() and mmap writeback from
- * re-dirtying the dnode after txg_wait_synced().
+ * Provided a RL_READER rangelock spanning 0-UINT64_MAX is
+		 * held by the caller, only a single restart will be required.
+ * We tolerate callers which do not hold the rangelock by
+ * returning EBUSY and not reporting holes after one restart.
*/
if (zfs_dmu_offset_next_sync) {
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
+
+ if (restarted)
+ return (SET_ERROR(EBUSY));
+
txg_wait_synced(dmu_objset_pool(os), 0);
+ restarted = 1;
goto restart;
}
@@ -2133,6 +2214,187 @@ restart:
return (err);
}
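/*
 * Illustrative, user-space view of the path above (not from this
 * patch): dmu_offset_next() ultimately backs lseek(2) with SEEK_HOLE /
 * SEEK_DATA, and a dirty file may simply be reported as all data.
 * On Linux, build with -D_GNU_SOURCE for SEEK_HOLE/SEEK_DATA.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 2)
		return (1);
	int fd = open(argv[1], O_RDONLY);
	if (fd == -1)
		return (1);
	off_t hole = lseek(fd, 0, SEEK_HOLE);	/* first hole at or after 0 */
	off_t data = lseek(fd, 0, SEEK_DATA);	/* first data at or after 0 */
	printf("first hole: %lld, first data: %lld\n",
	    (long long)hole, (long long)data);
	close(fd);
	return (0);
}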
+int
+dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+ blkptr_t *bps, size_t *nbpsp)
+{
+ dmu_buf_t **dbp, *dbuf;
+ dmu_buf_impl_t *db;
+ blkptr_t *bp;
+ int error, numbufs;
+
+ error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+ &numbufs, &dbp);
+ if (error != 0) {
+ if (error == ESRCH) {
+ error = SET_ERROR(ENXIO);
+ }
+ return (error);
+ }
+
+ ASSERT3U(numbufs, <=, *nbpsp);
+
+ for (int i = 0; i < numbufs; i++) {
+ dbuf = dbp[i];
+ db = (dmu_buf_impl_t *)dbuf;
+
+ mutex_enter(&db->db_mtx);
+
+ if (!list_is_empty(&db->db_dirty_records)) {
+ dbuf_dirty_record_t *dr;
+
+ dr = list_head(&db->db_dirty_records);
+ if (dr->dt.dl.dr_brtwrite) {
+ /*
+				 * This is a very special case where we clone a
+ * block and in the same transaction group we
+ * read its BP (most likely to clone the clone).
+ */
+ bp = &dr->dt.dl.dr_overridden_by;
+ } else {
+ /*
+ * The block was modified in the same
+ * transaction group.
+ */
+ mutex_exit(&db->db_mtx);
+ error = SET_ERROR(EAGAIN);
+ goto out;
+ }
+ } else {
+ bp = db->db_blkptr;
+ }
+
+ mutex_exit(&db->db_mtx);
+
+ if (bp == NULL) {
+ /*
+ * The file size was increased, but the block was never
+ * written, otherwise we would either have the block
+ * pointer or the dirty record and would not get here.
+ * It is effectively a hole, so report it as such.
+ */
+ BP_ZERO(&bps[i]);
+ continue;
+ }
+ /*
+ * Make sure we clone only data blocks.
+ */
+ if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+		 * If the block was allocated in a transaction group that is
+		 * not yet synced, we could clone it, but we could not write
+		 * this operation into the ZIL, as it might be impossible to
+		 * replay: the block may appear not yet allocated at that
+		 * point.
+ */
+ if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+ error = SET_ERROR(EAGAIN);
+ goto out;
+ }
+
+ bps[i] = *bp;
+ }
+
+ *nbpsp = numbufs;
+out:
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (error);
+}
+
+int
+dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+ dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
+{
+ spa_t *spa;
+ dmu_buf_t **dbp, *dbuf;
+ dmu_buf_impl_t *db;
+ struct dirty_leaf *dl;
+ dbuf_dirty_record_t *dr;
+ const blkptr_t *bp;
+ int error = 0, i, numbufs;
+
+ spa = os->os_spa;
+
+ VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+ &numbufs, &dbp));
+ ASSERT3U(nbps, ==, numbufs);
+
+ /*
+	 * Before we start cloning, make sure that the dbuf sizes match the
+	 * new BP sizes.  If they don't, that's a no-go, as we are not able
+	 * to shrink dbufs.
+ */
+ for (i = 0; i < numbufs; i++) {
+ dbuf = dbp[i];
+ db = (dmu_buf_impl_t *)dbuf;
+ bp = &bps[i];
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_SPILL_BLKID);
+
+ if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+ }
+
+ for (i = 0; i < numbufs; i++) {
+ dbuf = dbp[i];
+ db = (dmu_buf_impl_t *)dbuf;
+ bp = &bps[i];
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(db->db_blkid != DMU_SPILL_BLKID);
+ ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
+
+ dmu_buf_will_clone(dbuf, tx);
+
+ mutex_enter(&db->db_mtx);
+
+ dr = list_head(&db->db_dirty_records);
+ VERIFY(dr != NULL);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dl = &dr->dt.dl;
+ dl->dr_overridden_by = *bp;
+ if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
+ if (!BP_IS_EMBEDDED(bp)) {
+ BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
+ BP_GET_BIRTH(bp));
+ } else {
+ BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
+ dr->dr_txg);
+ }
+ }
+ dl->dr_brtwrite = B_TRUE;
+ dl->dr_override_state = DR_OVERRIDDEN;
+
+ mutex_exit(&db->db_mtx);
+
+ /*
+		 * When data is embedded into the BP there is no need to
+		 * create a BRT entry, as there is no data block.  Just
+		 * copy the BP as it contains the data.
+ */
+ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+ brt_pending_add(spa, bp, tx);
+ }
+ }
+out:
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (error);
+}
+
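/*
 * Illustrative sketch (not from this patch) of how the two entry
 * points above pair up for block cloning: read the source object's L0
 * block pointers, then install them into the destination dbufs under
 * an already-assigned tx.  The helper shape, sizing and error handling
 * are assumptions; EAGAIN from dmu_read_l0_bps() means the source was
 * touched in a txg that has not synced yet, so callers typically wait
 * and retry.
 */
static int
clone_range_sketch(objset_t *os, uint64_t srcobj, uint64_t dstobj,
    uint64_t off, uint64_t len, uint64_t blksz, dmu_tx_t *tx)
{
	size_t maxbps = len / blksz + 1;	/* assumed upper bound */
	size_t nbps = maxbps;
	blkptr_t *bps = kmem_zalloc(sizeof (blkptr_t) * maxbps, KM_SLEEP);
	int error;

	error = dmu_read_l0_bps(os, srcobj, off, len, bps, &nbps);
	if (error == 0)
		error = dmu_brt_clone(os, dstobj, off, len, tx, bps, nbps);

	kmem_free(bps, sizeof (blkptr_t) * maxbps);
	return (error);
}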
void
__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
@@ -2272,10 +2534,10 @@ byteswap_uint16_array(void *vbuf, size_t size)
buf[i] = BSWAP_16(buf[i]);
}
-/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
+ (void) vbuf, (void) size;
}
void
@@ -2313,6 +2575,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
EXPORT_SYMBOL(dmu_buf_rele_array);
EXPORT_SYMBOL(dmu_prefetch);
+EXPORT_SYMBOL(dmu_prefetch_by_dnode);
+EXPORT_SYMBOL(dmu_prefetch_dnode);
EXPORT_SYMBOL(dmu_free_range);
EXPORT_SYMBOL(dmu_free_long_range);
EXPORT_SYMBOL(dmu_free_long_object);
@@ -2341,16 +2605,15 @@ EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
EXPORT_SYMBOL(dmu_buf_hold);
EXPORT_SYMBOL(dmu_ot);
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
"Enable NOP writes");
-ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW,
"Percentage of dirtied blocks from frees in one TXG");
ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
"Enable forcing txg sync to find holes");
-ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW,
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
"Limit one prefetch call to this size");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c
index a573a2e1bd41..0def0956beb8 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_diff.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -108,15 +108,15 @@ report_dnode(dmu_diffarg_t *da, uint64_t object, dnode_phys_t *dnp)
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
-/* ARGSUSED */
static int
diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
+ (void) zilog;
dmu_diffarg_t *da = arg;
int err = 0;
- if (issig(JUSTLOOKING) && issig(FORREAL))
+ if (issig())
return (SET_ERROR(EINTR));
if (zb->zb_level == ZB_DNODE_LEVEL ||
diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c
index 12cdbd68b104..56986ea43446 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_object.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_object.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -41,12 +41,12 @@
* determined to be the lowest value that eliminates the measurable effect
* of lock contention from this code path.
*/
-int dmu_object_alloc_chunk_shift = 7;
+uint_t dmu_object_alloc_chunk_shift = 7;
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
- int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+ int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
uint64_t object;
uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
@@ -55,7 +55,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
int dn_slots = dnodesize >> DNODE_SHIFT;
boolean_t restarted = B_FALSE;
uint64_t *cpuobj = NULL;
- int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
int error;
cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
@@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
* is not suitably aligned.
*/
os->os_obj_next_chunk =
- P2ALIGN(object, dnodes_per_chunk) +
+ P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
dnodes_per_chunk;
(void) atomic_swap_64(cpuobj, object);
mutex_exit(&os->os_obj_lock);
@@ -255,7 +255,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
uint64_t
dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
- int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+ int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
@@ -409,6 +409,8 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
* hand off to dnode_next_offset() for further scanning.
*/
while (i <= last_obj) {
+ if (i == 0)
+ return (SET_ERROR(ESRCH));
error = dmu_object_info(os, i, &doi);
if (error == ENOENT) {
if (hole) {
@@ -518,6 +520,6 @@ EXPORT_SYMBOL(dmu_object_zapify);
EXPORT_SYMBOL(dmu_object_free_zapified);
/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
"CPU-specific allocator grabs 2^N objects at once");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
index b30a9d619034..8f4fefa4f4dd 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -32,6 +32,7 @@
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -78,16 +79,16 @@ krwlock_t os_lock;
* datasets.
* Default is 4 times the number of leaf vdevs.
*/
-int dmu_find_threads = 0;
+static const int dmu_find_threads = 0;
/*
* Backfill lower metadnode objects after this many have been freed.
* Backfilling negatively impacts object creation rates, so only do it
* if there are enough holes to fill.
*/
-int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;
+static const int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;
-static char *upgrade_tag = "upgrade_tag";
+static const char *upgrade_tag = "upgrade_tag";
static void dmu_objset_find_dp_cb(void *arg);
@@ -263,6 +264,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval)
}
static void
+prefetch_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE ||
+ newval == ZFS_PREFETCH_METADATA);
+ os->os_prefetch = newval;
+}
+
+static void
sync_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -287,7 +301,9 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval)
* Inheritance and range checking should have been done by now.
*/
ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
- newval == ZFS_REDUNDANT_METADATA_MOST);
+ newval == ZFS_REDUNDANT_METADATA_MOST ||
+ newval == ZFS_REDUNDANT_METADATA_SOME ||
+ newval == ZFS_REDUNDANT_METADATA_NONE);
os->os_redundant_metadata = newval;
}
@@ -384,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj)
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
/*
- * The low 6 bits of the pointer don't have much entropy, because
- * the objset_t is larger than 2^6 bytes long.
+ * The lower 11 bits of the pointer don't have much entropy, because
+ * the objset_t is more than 1KB long and so likely aligned to 2KB.
*/
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
@@ -416,28 +432,28 @@ dnode_multilist_index_func(multilist_t *ml, void *obj)
static inline boolean_t
dmu_os_is_l2cacheable(objset_t *os)
{
- vdev_t *vd = NULL;
- zfs_cache_type_t cache = os->os_secondary_cache;
- blkptr_t *bp = os->os_rootbp;
+ if (os->os_secondary_cache == ZFS_CACHE_ALL ||
+ os->os_secondary_cache == ZFS_CACHE_METADATA) {
+ if (l2arc_exclude_special == 0)
+ return (B_TRUE);
- if (bp != NULL && !BP_IS_HOLE(bp)) {
+ blkptr_t *bp = os->os_rootbp;
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (B_FALSE);
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
vdev_t *rvd = os->os_spa->spa_root_vdev;
+ vdev_t *vd = NULL;
if (vdev < rvd->vdev_children)
vd = rvd->vdev_child[vdev];
- if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) {
- if (vd == NULL)
- return (B_TRUE);
+ if (vd == NULL)
+ return (B_TRUE);
- if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
- vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
- l2arc_exclude_special == 0)
- return (B_TRUE);
- }
+ if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+ vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+ return (B_TRUE);
}
-
return (B_FALSE);
}
@@ -479,7 +495,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
arc_flags_t aflags = ARC_FLAG_WAIT;
zbookmark_phys_t zb;
int size;
- enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
@@ -516,8 +532,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
if (arc_buf_size(os->os_phys_buf) < size) {
arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
ARC_BUFC_METADATA, size);
- bzero(buf->b_data, size);
- bcopy(os->os_phys_buf->b_data, buf->b_data,
+ memset(buf->b_data, 0, size);
+ memcpy(buf->b_data, os->os_phys_buf->b_data,
arc_buf_size(os->os_phys_buf));
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
os->os_phys_buf = buf;
@@ -531,7 +547,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
ARC_BUFC_METADATA, size);
os->os_phys = os->os_phys_buf->b_data;
- bzero(os->os_phys, size);
+ memset(os->os_phys, 0, size);
}
/*
* These properties will be filled in by the logic in zfs_get_zplprop()
@@ -559,6 +575,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
secondary_cache_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_PREFETCH),
+ prefetch_changed_cb, os);
+ }
if (!ds->ds_is_snapshot) {
if (err == 0) {
err = dsl_prop_register(ds,
@@ -632,6 +653,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
os->os_dnodesize = DNODE_MIN_SIZE;
+ os->os_prefetch = ZFS_PREFETCH_ALL;
}
if (ds == NULL || !ds->ds_is_snapshot)
@@ -714,7 +736,7 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
* can be held at a time.
*/
int
-dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
+dmu_objset_hold_flags(const char *name, boolean_t decrypt, const void *tag,
objset_t **osp)
{
dsl_pool_t *dp;
@@ -742,18 +764,18 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
}
int
-dmu_objset_hold(const char *name, void *tag, objset_t **osp)
+dmu_objset_hold(const char *name, const void *tag, objset_t **osp)
{
return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));
}
static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
- boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+ boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
- int err;
+ (void) tag;
- err = dmu_objset_from_ds(ds, osp);
+ int err = dmu_objset_from_ds(ds, osp);
if (err != 0) {
return (err);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
@@ -789,7 +811,7 @@ dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
*/
int
dmu_objset_own(const char *name, dmu_objset_type_t type,
- boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+ boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@@ -834,7 +856,7 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
- boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+ boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
{
dsl_dataset_t *ds;
int err;
@@ -855,7 +877,7 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
}
void
-dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
+dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, const void *tag)
{
ds_hold_flags_t flags;
dsl_pool_t *dp = dmu_objset_pool(os);
@@ -866,7 +888,7 @@ dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
}
void
-dmu_objset_rele(objset_t *os, void *tag)
+dmu_objset_rele(objset_t *os, const void *tag)
{
dmu_objset_rele_flags(os, B_FALSE, tag);
}
@@ -884,7 +906,7 @@ dmu_objset_rele(objset_t *os, void *tag)
*/
void
dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
- boolean_t decrypt, void *tag)
+ boolean_t decrypt, const void *tag)
{
dsl_pool_t *dp;
char name[ZFS_MAX_DATASET_NAME_LEN];
@@ -904,7 +926,7 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
}
void
-dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
+dmu_objset_disown(objset_t *os, boolean_t decrypt, const void *tag)
{
ds_hold_flags_t flags;
@@ -1118,12 +1140,14 @@ dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
(!os->os_encrypted || !dmu_objset_is_receiving(os))) {
os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
if (dmu_objset_userobjused_enabled(os)) {
+ ASSERT3P(ds, !=, NULL);
ds->ds_feature_activation[
SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
os->os_phys->os_flags |=
OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
}
if (dmu_objset_projectquota_enabled(os)) {
+ ASSERT3P(ds, !=, NULL);
ds->ds_feature_activation[
SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
os->os_phys->os_flags |=
@@ -1157,7 +1181,6 @@ typedef struct dmu_objset_create_arg {
dsl_crypto_params_t *doca_dcp;
} dmu_objset_create_arg_t;
-/*ARGSUSED*/
static int
dmu_objset_create_check(void *arg, dmu_tx_t *tx)
{
@@ -1299,6 +1322,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
ASSERT3P(ds->ds_key_mapping, !=, NULL);
key_mapping_rele(spa, ds->ds_key_mapping, ds);
dsl_dataset_sync_done(ds, tx);
+ dmu_buf_rele(ds->ds_dbuf, ds);
}
mutex_enter(&ds->ds_lock);
@@ -1353,7 +1377,6 @@ typedef struct dmu_objset_clone_arg {
proc_t *doca_proc;
} dmu_objset_clone_arg_t;
-/*ARGSUSED*/
static int
dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
{
@@ -1565,10 +1588,10 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
}
}
-/* ARGSUSED */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
+ (void) abuf;
blkptr_t *bp = zio->io_bp;
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
@@ -1596,10 +1619,10 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
}
-/* ARGSUSED */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
+ (void) abuf;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
objset_t *os = arg;
@@ -1616,28 +1639,92 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
kmem_free(bp, sizeof (*bp));
}
+typedef struct sync_objset_arg {
+ zio_t *soa_zio;
+ objset_t *soa_os;
+ dmu_tx_t *soa_tx;
+ kmutex_t soa_mutex;
+ int soa_count;
+ taskq_ent_t soa_tq_ent;
+} sync_objset_arg_t;
+
typedef struct sync_dnodes_arg {
- multilist_t *sda_list;
- int sda_sublist_idx;
- multilist_t *sda_newlist;
- dmu_tx_t *sda_tx;
+ multilist_t *sda_list;
+ int sda_sublist_idx;
+ multilist_t *sda_newlist;
+ sync_objset_arg_t *sda_soa;
} sync_dnodes_arg_t;
+static void sync_meta_dnode_task(void *arg);
+
static void
sync_dnodes_task(void *arg)
{
sync_dnodes_arg_t *sda = arg;
+ sync_objset_arg_t *soa = sda->sda_soa;
+ objset_t *os = soa->soa_os;
+ uint_t allocator = spa_acq_allocator(os->os_spa);
multilist_sublist_t *ms =
- multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+ multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
- dmu_objset_sync_dnodes(ms, sda->sda_tx);
+ dmu_objset_sync_dnodes(ms, soa->soa_tx);
multilist_sublist_unlock(ms);
+ spa_rel_allocator(os->os_spa, allocator);
kmem_free(sda, sizeof (*sda));
+
+ mutex_enter(&soa->soa_mutex);
+ ASSERT(soa->soa_count != 0);
+ if (--soa->soa_count != 0) {
+ mutex_exit(&soa->soa_mutex);
+ return;
+ }
+ mutex_exit(&soa->soa_mutex);
+
+ taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+ sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
}
+/*
+ * Issue the zio_nowait() for all dirty record zios on the meta dnode,
+ * then trigger the callback for the zil_sync. This runs once for each
+ * objset, only after any/all sublists in the objset have been synced.
+ */
+static void
+sync_meta_dnode_task(void *arg)
+{
+ sync_objset_arg_t *soa = arg;
+ objset_t *os = soa->soa_os;
+ dmu_tx_t *tx = soa->soa_tx;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ dbuf_dirty_record_t *dr;
+
+ ASSERT0(soa->soa_count);
+
+ list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+ while ((dr = list_remove_head(list)) != NULL) {
+ ASSERT0(dr->dr_dbuf->db_level);
+ zio_nowait(dr->dr_zio);
+ }
+
+ /* Enable dnode backfill if enough objects have been freed. */
+ if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+ os->os_rescan_dnodes = B_TRUE;
+ os->os_freed_dnodes = 0;
+ }
+
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ os->os_phys->os_zil_header = os->os_zil_header;
+ zio_nowait(soa->soa_zio);
+
+ mutex_destroy(&soa->soa_mutex);
+ kmem_free(soa, sizeof (*soa));
+}
/* called from dsl */
void
@@ -1647,8 +1734,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
zbookmark_phys_t zb;
zio_prop_t zp;
zio_t *zio;
- list_t *list;
- dbuf_dirty_record_t *dr;
int num_sublists;
multilist_t *ml;
blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
@@ -1693,8 +1778,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
}
zio = arc_write(pio, os->os_spa, tx->tx_txg,
- blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os),
- &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+ blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
+ &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
@@ -1735,40 +1820,49 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
offsetof(dnode_t, dn_dirty_link[txgoff]));
}
+ /*
+ * zio_nowait(zio) is done after any/all sublist and meta dnode
+ * zios have been nowaited, and the zil_sync() has been performed.
+ * The soa is freed at the end of sync_meta_dnode_task.
+ */
+ sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
+ soa->soa_zio = zio;
+ soa->soa_os = os;
+ soa->soa_tx = tx;
+ taskq_init_ent(&soa->soa_tq_ent);
+ mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL);
+
ml = &os->os_dirty_dnodes[txgoff];
- num_sublists = multilist_get_num_sublists(ml);
+ soa->soa_count = num_sublists = multilist_get_num_sublists(ml);
+
for (int i = 0; i < num_sublists; i++) {
if (multilist_sublist_is_empty_idx(ml, i))
- continue;
- sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
- sda->sda_list = ml;
- sda->sda_sublist_idx = i;
- sda->sda_tx = tx;
- (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
- sync_dnodes_task, sda, 0);
- /* callback frees sda */
+ soa->soa_count--;
}
- taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
- list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
- while ((dr = list_head(list)) != NULL) {
- ASSERT0(dr->dr_dbuf->db_level);
- list_remove(list, dr);
- zio_nowait(dr->dr_zio);
- }
-
- /* Enable dnode backfill if enough objects have been freed. */
- if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
- os->os_rescan_dnodes = B_TRUE;
- os->os_freed_dnodes = 0;
+ if (soa->soa_count == 0) {
+ taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
+ sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
+ } else {
+ /*
+ * Sync sublists in parallel. The last to finish
+ * (i.e., when soa->soa_count reaches zero) must
+ * dispatch sync_meta_dnode_task.
+ */
+ for (int i = 0; i < num_sublists; i++) {
+ if (multilist_sublist_is_empty_idx(ml, i))
+ continue;
+ sync_dnodes_arg_t *sda =
+ kmem_alloc(sizeof (*sda), KM_SLEEP);
+ sda->sda_list = ml;
+ sda->sda_sublist_idx = i;
+ sda->sda_soa = soa;
+ (void) taskq_dispatch(
+ dmu_objset_pool(os)->dp_sync_taskq,
+ sync_dnodes_task, sda, 0);
+ /* sync_dnodes_task frees sda */
+ }
}
-
- /*
- * Free intent log blocks up to this tx.
- */
- zil_sync(os->os_zil, tx);
- os->os_phys->os_zil_header = os->os_zil_header;
- zio_nowait(zio);
}
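/*
 * Illustrative sketch (not from this patch) of the counting scheme the
 * rework above relies on: each sublist task decrements a shared count
 * and only the last one to finish dispatches the finalizer (here,
 * sync_meta_dnode_task).  Types and names below are made up for
 * illustration.
 */
typedef struct fanout {
	kmutex_t	fo_lock;
	int		fo_outstanding;
	void		(*fo_final)(void *);
	void		*fo_arg;
} fanout_t;

static void
fanout_task_done(fanout_t *fo)
{
	boolean_t last;

	mutex_enter(&fo->fo_lock);
	ASSERT(fo->fo_outstanding > 0);
	last = (--fo->fo_outstanding == 0);
	mutex_exit(&fo->fo_lock);

	/* Exactly one worker observes the count hitting zero. */
	if (last)
		fo->fo_final(fo->fo_arg);
}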
boolean_t
@@ -1984,8 +2078,8 @@ userquota_updates_task(void *arg)
dnode_t *dn;
userquota_cache_t cache = { { 0 } };
- multilist_sublist_t *list =
- multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+ multilist_sublist_t *list = multilist_sublist_lock_idx(
+ &os->os_synced_dnodes, uua->uua_sublist_idx);
ASSERT(multilist_sublist_head(list) == NULL ||
dmu_objset_userused_enabled(os));
@@ -2067,8 +2161,8 @@ dnode_rele_task(void *arg)
userquota_updates_arg_t *uua = arg;
objset_t *os = uua->uua_os;
- multilist_sublist_t *list =
- multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+ multilist_sublist_t *list = multilist_sublist_lock_idx(
+ &os->os_synced_dnodes, uua->uua_sublist_idx);
dnode_t *dn;
while ((dn = multilist_sublist_head(list)) != NULL) {
@@ -2343,7 +2437,7 @@ dmu_objset_space_upgrade(objset_t *os)
if (err != 0)
return (err);
- if (issig(JUSTLOOKING) && issig(FORREAL))
+ if (issig())
return (SET_ERROR(EINTR));
objerr = dmu_bonus_hold(os, obj, FTAG, &db);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c
index 0ec46bdb4f47..0119191d7920 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_recv.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -27,8 +27,12 @@
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2022 Axcient.
*/
+#include <sys/arc.h>
+#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_send.h>
@@ -64,12 +68,19 @@
#endif
#include <sys/zfs_file.h>
-int zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
-int zfs_recv_queue_ff = 20;
-int zfs_recv_write_batch_size = 1024 * 1024;
+static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
+static uint_t zfs_recv_queue_ff = 20;
+static uint_t zfs_recv_write_batch_size = 1024 * 1024;
+static int zfs_recv_best_effort_corrective = 0;
-static char *dmu_recv_tag = "dmu_recv_tag";
-const char *recv_clone_name = "%recv";
+static const void *const dmu_recv_tag = "dmu_recv_tag";
+const char *const recv_clone_name = "%recv";
+
+typedef enum {
+ ORNS_NO,
+ ORNS_YES,
+ ORNS_MAYBE
+} or_need_sync_t;
static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
void *buf);
@@ -102,6 +113,8 @@ struct receive_writer_arg {
boolean_t done;
int err;
+ const char *tofs;
+ boolean_t heal;
boolean_t resumable;
boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */
boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
@@ -121,6 +134,10 @@ struct receive_writer_arg {
uint8_t or_iv[ZIO_DATA_IV_LEN];
uint8_t or_mac[ZIO_DATA_MAC_LEN];
boolean_t or_byteorder;
+ zio_t *heal_pio;
+
+ /* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */
+ or_need_sync_t or_need_sync;
};
typedef struct dmu_recv_begin_arg {
@@ -343,9 +360,10 @@ static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
uint64_t fromguid, uint64_t featureflags)
{
- uint64_t val;
+ uint64_t obj;
uint64_t children;
int error;
+ dsl_dataset_t *snap;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
@@ -354,7 +372,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
/* Temporary clone name must not exist. */
error = zap_lookup(dp->dp_meta_objset,
dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
- 8, 1, &val);
+ 8, 1, &obj);
if (error != ENOENT)
return (error == 0 ? SET_ERROR(EBUSY) : error);
@@ -362,12 +380,16 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
if (dsl_dataset_has_resume_receive_state(ds))
return (SET_ERROR(EBUSY));
- /* New snapshot name must not exist. */
+ /* New snapshot name must not exist if we're not healing it. */
error = zap_lookup(dp->dp_meta_objset,
dsl_dataset_phys(ds)->ds_snapnames_zapobj,
- drba->drba_cookie->drc_tosnap, 8, 1, &val);
- if (error != ENOENT)
+ drba->drba_cookie->drc_tosnap, 8, 1, &obj);
+ if (drba->drba_cookie->drc_heal) {
+ if (error != 0)
+ return (error);
+ } else if (error != ENOENT) {
return (error == 0 ? SET_ERROR(EEXIST) : error);
+ }
/* Must not have children if receiving a ZVOL. */
error = zap_count(dp->dp_meta_objset,
@@ -392,8 +414,40 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
if (error != 0)
return (error);
- if (fromguid != 0) {
- dsl_dataset_t *snap;
+ if (drba->drba_cookie->drc_heal) {
+ /* Encryption is incompatible with embedded data. */
+ if (encrypted && embed)
+ return (SET_ERROR(EINVAL));
+
+ /* Healing is not supported when in 'force' mode. */
+ if (drba->drba_cookie->drc_force)
+ return (SET_ERROR(EINVAL));
+
+ /* Must have keys loaded if doing encrypted non-raw recv. */
+ if (encrypted && !raw) {
+ if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object,
+ NULL, NULL) != 0)
+ return (SET_ERROR(EACCES));
+ }
+
+ error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap);
+ if (error != 0)
+ return (error);
+
+ /*
+		 * When not doing best-effort corrective recv, healing can
+		 * only be done if the send stream is for the same snapshot
+		 * as the one we are trying to heal.
+ */
+ if (zfs_recv_best_effort_corrective == 0 &&
+ drba->drba_cookie->drc_drrb->drr_toguid !=
+ dsl_dataset_phys(snap)->ds_guid) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ dsl_dataset_rele(snap, FTAG);
+ } else if (fromguid != 0) {
+ /* Sanity check the incremental recv */
uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
/* Can't perform a raw receive on top of a non-raw receive */
@@ -459,7 +513,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
dsl_dataset_rele(snap, FTAG);
} else {
- /* if full, then must be forced */
+ /* If full and not healing then must be forced. */
if (!drba->drba_cookie->drc_force)
return (SET_ERROR(EEXIST));
@@ -597,7 +651,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
if (!(flags & DRR_FLAG_SPILL_BLOCK))
return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
} else {
- dsflags |= DS_HOLD_FLAG_DECRYPT;
+ /*
+ * We support unencrypted datasets below encrypted ones now,
+ * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing
+ * with a dataset we may encrypt.
+ */
+ if (drba->drba_dcp == NULL ||
+ drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
}
error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
@@ -618,6 +680,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
char buf[ZFS_MAX_DATASET_NAME_LEN];
objset_t *os;
+ /* healing recv must be done "into" an existing snapshot */
+ if (drba->drba_cookie->drc_heal == B_TRUE)
+ return (SET_ERROR(ENOTSUP));
+
/*
* If it's a non-clone incremental, we are missing the
* target fs, so fail the recv.
@@ -799,7 +865,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
if (error == 0) {
- /* create temporary clone */
+ /* Create temporary clone unless we're doing corrective recv */
dsl_dataset_t *snap = NULL;
if (drba->drba_cookie->drc_fromsnapobj != 0) {
@@ -807,8 +873,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
ASSERT3P(dcp, ==, NULL);
}
- dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
- snap, crflags, drba->drba_cred, dcp, tx);
+ if (drc->drc_heal) {
+ /* When healing we want to use the provided snapshot */
+ VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap,
+ &dsobj));
+ } else {
+ dsobj = dsl_dataset_create_sync(ds->ds_dir,
+ recv_clone_name, snap, crflags, drba->drba_cred,
+ dcp, tx);
+ }
if (drba->drba_cookie->drc_fromsnapobj != 0)
dsl_dataset_rele(snap, FTAG);
dsl_dataset_rele_flags(ds, dsflags, FTAG);
@@ -925,7 +998,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
*/
rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
- (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
+ (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
+ !drc->drc_heal) {
(void) dmu_objset_create_impl(dp->dp_spa,
newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
}
@@ -981,13 +1055,24 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
dsflags |= DS_HOLD_FLAG_DECRYPT;
}
+ boolean_t recvexist = B_TRUE;
if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
/* %recv does not exist; continue in tofs */
+ recvexist = B_FALSE;
error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
if (error != 0)
return (error);
}
+ /*
+	 * Resuming a full/newfs recv on top of an existing dataset requires
+	 * the force flag.
+ */
+ if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(ZFS_ERR_RESUME_EXISTS));
+ }
+
/* check that ds is marked inconsistent */
if (!DS_IS_INCONSISTENT(ds)) {
dsl_dataset_rele_flags(ds, dsflags, FTAG);
@@ -1132,20 +1217,22 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
-dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
- boolean_t force, boolean_t resumable, nvlist_t *localprops,
- nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc,
- zfs_file_t *fp, offset_t *voffp)
+dmu_recv_begin(const char *tofs, const char *tosnap,
+ dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal,
+ boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args,
+ const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp,
+ offset_t *voffp)
{
dmu_recv_begin_arg_t drba = { 0 };
- int err;
+ int err = 0;
- bzero(drc, sizeof (dmu_recv_cookie_t));
+ memset(drc, 0, sizeof (dmu_recv_cookie_t));
drc->drc_drr_begin = drr_begin;
drc->drc_drrb = &drr_begin->drr_u.drr_begin;
drc->drc_tosnap = tosnap;
drc->drc_tofs = tofs;
drc->drc_force = force;
+ drc->drc_heal = heal;
drc->drc_resumable = resumable;
drc->drc_cred = CRED();
drc->drc_proc = curproc;
@@ -1169,20 +1256,36 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
- void *payload = NULL;
- if (payloadlen != 0)
- payload = kmem_alloc(payloadlen, KM_SLEEP);
- err = receive_read_payload_and_next_header(drc, payloadlen,
- payload);
- if (err != 0) {
- kmem_free(payload, payloadlen);
- return (err);
- }
+ /*
+ * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace
+ * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard
+ * upper limit. Systems with less than 1GB of RAM will see a lower
+ * limit from `arc_all_memory() / 4`.
+ */
+ if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4)))
+ return (E2BIG);
+
+
if (payloadlen != 0) {
+ void *payload = vmem_alloc(payloadlen, KM_SLEEP);
+ /*
+ * For compatibility with recursive send streams, we don't do
+ * this here if the stream could be part of a package. Instead,
+ * we'll do it in dmu_recv_stream. If we pull the next header
+ * too early, and it's the END record, we break the `recv_skip`
+ * logic.
+ */
+
+ err = receive_read_payload_and_next_header(drc, payloadlen,
+ payload);
+ if (err != 0) {
+ vmem_free(payload, payloadlen);
+ return (err);
+ }
err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
KM_SLEEP);
- kmem_free(payload, payloadlen);
+ vmem_free(payload, payloadlen);
if (err != 0) {
kmem_free(drc->drc_next_rrd,
sizeof (*drc->drc_next_rrd));
@@ -1203,7 +1306,6 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
&drba, 5, ZFS_SPACE_CHECK_NORMAL);
} else {
-
/*
* For non-raw, non-incremental, non-resuming receives the
* user can specify encryption parameters on the command line
@@ -1236,6 +1338,186 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
return (err);
}
+/*
+ * Holds data needed for the corrective recv callback
+ */
+typedef struct cr_cb_data {
+ uint64_t size;
+ zbookmark_phys_t zb;
+ spa_t *spa;
+} cr_cb_data_t;
+
+static void
+corrective_read_done(zio_t *zio)
+{
+ cr_cb_data_t *data = zio->io_private;
+ /* Corruption corrected; update error log if needed */
+ if (zio->io_error == 0) {
+ spa_remove_error(data->spa, &data->zb,
+ BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ }
+ kmem_free(data, sizeof (cr_cb_data_t));
+ abd_free(zio->io_abd);
+}
+
+/*
+ * zio_rewrite the data pointed to by bp with the data from the rrd's abd.
+ */
+static int
+do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
+ struct receive_record_arg *rrd, blkptr_t *bp)
+{
+ int err;
+ zio_t *io;
+ zbookmark_phys_t zb;
+ dnode_t *dn;
+ abd_t *abd = rrd->abd;
+ zio_cksum_t bp_cksum = bp->blk_cksum;
+ zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_CANFAIL;
+
+ if (rwa->raw)
+ flags |= ZIO_FLAG_RAW;
+
+ err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0,
+ dbuf_whichblock(dn, 0, drrw->drr_offset));
+ dnode_rele(dn, FTAG);
+
+ if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) {
+ /* Decompress the stream data */
+ abd_t *dabd = abd_alloc_linear(
+ drrw->drr_logical_size, B_FALSE);
+ err = zio_decompress_data(drrw->drr_compressiontype,
+ abd, abd_to_buf(dabd), abd_get_size(abd),
+ abd_get_size(dabd), NULL);
+
+ if (err != 0) {
+ abd_free(dabd);
+ return (err);
+ }
+ /* Swap in the newly decompressed data into the abd */
+ abd_free(abd);
+ abd = dabd;
+ }
+
+ if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ /* Recompress the data */
+ abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
+ B_FALSE);
+ void *buf = abd_to_buf(cabd);
+ uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
+ abd, &buf, abd_get_size(abd),
+ rwa->os->os_complevel);
+ abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
+ /* Swap in newly compressed data into the abd */
+ abd_free(abd);
+ abd = cabd;
+ flags |= ZIO_FLAG_RAW_COMPRESS;
+ }
+
+ /*
+ * The stream is not encrypted but the data on-disk is.
+ * We need to re-encrypt the buf using the same
+	 * encryption type, salt, iv, and mac that were used to encrypt
+	 * the block previously.
+ */
+ if (!rwa->raw && BP_USES_CRYPT(bp)) {
+ dsl_dataset_t *ds;
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+ dsl_pool_t *dp = dmu_objset_pool(rwa->os);
+ abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE);
+
+ zio_crypt_decode_params_bp(bp, salt, iv);
+ zio_crypt_decode_mac_bp(bp, mac);
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dsl_dataset_hold_flags(dp, rwa->tofs,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ abd_free(eabd);
+ return (SET_ERROR(EACCES));
+ }
+
+ /* Look up the key from the spa's keystore */
+ err = spa_keystore_lookup_key(rwa->os->os_spa,
+ zb.zb_objset, FTAG, &dck);
+ if (err != 0) {
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
+ FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ abd_free(eabd);
+ return (SET_ERROR(EACCES));
+ }
+
+ err = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
+ BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv,
+ mac, abd_get_size(abd), abd, eabd, &no_crypt);
+
+ spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG);
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+
+ ASSERT0(no_crypt);
+ if (err != 0) {
+ abd_free(eabd);
+ return (err);
+ }
+ /* Swap in the newly encrypted data into the abd */
+ abd_free(abd);
+ abd = eabd;
+
+ /*
+ * We want to prevent zio_rewrite() from trying to
+ * encrypt the data again
+ */
+ flags |= ZIO_FLAG_RAW_ENCRYPT;
+ }
+ rrd->abd = abd;
+
+ io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
+ abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
+ &zb);
+
+ ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
+ abd_get_size(abd) == BP_GET_PSIZE(bp));
+
+ /* compute new bp checksum value and make sure it matches the old one */
+ zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd));
+ if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) {
+ zio_destroy(io);
+ if (zfs_recv_best_effort_corrective != 0)
+ return (0);
+ return (SET_ERROR(ECKSUM));
+ }
+
+ /* Correct the corruption in place */
+ err = zio_wait(io);
+ if (err == 0) {
+ cr_cb_data_t *cb_data =
+ kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP);
+ cb_data->spa = rwa->os->os_spa;
+ cb_data->size = drrw->drr_logical_size;
+ cb_data->zb = zb;
+ /* Test if healing worked by re-reading the bp */
+ err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp,
+ abd_alloc_for_io(drrw->drr_logical_size, B_FALSE),
+ drrw->drr_logical_size, corrective_read_done,
+ cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL));
+ }
+ if (err != 0 && zfs_recv_best_effort_corrective != 0)
+ err = 0;
+
+ return (err);
+}
+
static int
receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
{
@@ -1249,11 +1531,11 @@ receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
(drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
while (done < len) {
- ssize_t resid;
+ ssize_t resid = len - done;
zfs_file_t *fp = drc->drc_fp;
int err = zfs_file_read(fp, (char *)buf + done,
len - done, &resid);
- if (resid == len - done) {
+ if (err == 0 && resid == len - done) {
/*
* Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates
* that the receive was interrupted and can
@@ -1516,17 +1798,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa,
}
/*
- * The dmu does not currently support decreasing nlevels
- * or changing the number of dnode slots on an object. For
- * non-raw sends, this does not matter and the new object
- * can just use the previous one's nlevels. For raw sends,
- * however, the structure of the received dnode (including
- * nlevels and dnode slots) must match that of the send
- * side. Therefore, instead of using dmu_object_reclaim(),
- * we must free the object completely and call
- * dmu_object_claim_dnsize() instead.
+ * The dmu does not currently support decreasing nlevels or changing
+	 * indirect block size if there is already one, or changing the
+	 * number of dnode slots on an object.  For non-raw sends this
+	 * does not matter and the new object can just use the previous one's
+ * parameters. For raw sends, however, the structure of the received
+ * dnode (including indirects and dnode slots) must match that of the
+ * send side. Therefore, instead of using dmu_object_reclaim(), we
+ * must free the object completely and call dmu_object_claim_dnsize()
+ * instead.
*/
- if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
+ if ((rwa->raw && ((doi->doi_indirection > 1 &&
+ indblksz != doi->doi_metadata_block_size) ||
+ drro->drr_nlevels < doi->doi_indirection)) ||
dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_object(rwa->os, drro->drr_object);
if (err != 0)
@@ -1634,6 +1918,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (err == 0) {
err = receive_handle_existing_object(rwa, drro, &doi, data,
&object_to_hold, &new_blksz);
+ if (err != 0)
+ return (err);
} else if (err == EEXIST) {
/*
* The object requested is currently an interior slot of a
@@ -1650,10 +1936,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
/* object was freed and we are about to allocate a new one */
object_to_hold = DMU_NEW_OBJECT;
} else {
+ /*
+ * If the only record in this range so far was DRR_FREEOBJECTS
+ * with at least one actually freed object, it's possible that
+ * the block will now be converted to a hole. We need to wait
+ * for the txg to sync to prevent races.
+ */
+ if (rwa->or_need_sync == ORNS_YES)
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+
/* object is free and we are about to allocate a new one */
object_to_hold = DMU_NEW_OBJECT;
}
+ /* Only relevant for the first object in the range */
+ rwa->or_need_sync = ORNS_NO;
+
/*
* If this is a multi-slot dnode there is a chance that this
* object will expand into a slot that is already used by
@@ -1800,7 +2098,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
- bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro));
+ memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro));
/*
* Raw bonus buffers have their byteorder determined by the
@@ -1815,12 +2113,21 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_buf_rele(db, FTAG);
dnode_rele(dn, FTAG);
}
+
+ /*
+ * If the receive fails, we want the resume stream to start with the
+ * same record that we last successfully received. There is no way to
+ * request resume from the object record, but we can benefit from the
+ * fact that sender always sends object record before anything else,
+	 * fact that the sender always sends the object record before anything
+	 * else, after which it will "resend" data at offset 0 and resume
+	 * normally.
+ save_resume_state(rwa, drro->drr_object, 0, tx);
+
dmu_tx_commit(tx);
return (0);
}
-/* ARGSUSED */
noinline static int
receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
@@ -1848,6 +2155,9 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0)
return (err);
+
+ if (rwa->or_need_sync == ORNS_MAYBE)
+ rwa->or_need_sync = ORNS_YES;
}
if (next_err != ESRCH)
return (next_err);
@@ -1931,10 +2241,10 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
if (err == 0)
abd_free(abd);
} else {
- zio_prop_t zp;
+ zio_prop_t zp = {0};
dmu_write_policy(rwa->os, dn, 0, 0, &zp);
- enum zio_flag zio_flags = 0;
+ zio_flag_t zio_flags = 0;
if (rwa->raw) {
zp.zp_encrypt = B_TRUE;
@@ -1942,11 +2252,11 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
!!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
rwa->byteswap;
- bcopy(drrw->drr_salt, zp.zp_salt,
+ memcpy(zp.zp_salt, drrw->drr_salt,
ZIO_DATA_SALT_LEN);
- bcopy(drrw->drr_iv, zp.zp_iv,
+ memcpy(zp.zp_iv, drrw->drr_iv,
ZIO_DATA_IV_LEN);
- bcopy(drrw->drr_mac, zp.zp_mac,
+ memcpy(zp.zp_mac, drrw->drr_mac,
ZIO_DATA_MAC_LEN);
if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
zp.zp_nopwrite = B_FALSE;
@@ -2043,6 +2353,53 @@ receive_process_write_record(struct receive_writer_arg *rwa,
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
+ if (rwa->heal) {
+ blkptr_t *bp;
+ dmu_buf_t *dbp;
+ int flags = DB_RF_CANFAIL;
+
+ if (rwa->raw)
+ flags |= DB_RF_NO_DECRYPT;
+
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrw->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd),
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
+ }
+
+ err = dmu_buf_hold_noread(rwa->os, drrw->drr_object,
+ drrw->drr_offset, FTAG, &dbp);
+ if (err != 0)
+ return (err);
+
+ /* Try to read the object to see if it needs healing */
+ err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags);
+ /*
+		 * We only try to heal when dbuf_read() returns ECKSUM.
+		 * Other errors (even EIO) get returned to the caller.
+ * EIO indicates that the device is not present/accessible,
+ * so writing to it will likely fail.
+ * If the block is healthy, we don't want to overwrite it
+ * unnecessarily.
+ */
+ if (err != ECKSUM) {
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+ }
+ /* Make sure the on-disk block and recv record sizes match */
+ if (drrw->drr_logical_size != dbp->db_size) {
+ err = ENOTSUP;
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+ }
+ /* Get the block pointer for the corrupted block */
+ bp = dmu_buf_get_blkptr(dbp);
+ err = do_corrective_recv(rwa, drrw, rrd, bp);
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+ }
+
/*
* For resuming to work, records must be in increasing order
* by (object, offset).
@@ -2183,7 +2540,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
* size of the provided arc_buf_t.
*/
if (db_spill->db_size != drrs->drr_length) {
- dmu_buf_will_fill(db_spill, tx);
+ dmu_buf_will_fill(db_spill, tx, B_FALSE);
VERIFY0(dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
}
@@ -2211,7 +2568,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
}
}
- bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs));
+ memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs));
abd_free(abd);
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
@@ -2222,7 +2579,6 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
return (0);
}
-/* ARGSUSED */
noinline static int
receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
{
@@ -2285,11 +2641,13 @@ receive_object_range(struct receive_writer_arg *rwa,
rwa->or_crypt_params_present = B_TRUE;
rwa->or_firstobj = drror->drr_firstobj;
rwa->or_numslots = drror->drr_numslots;
- bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN);
- bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN);
- bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);
+ memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN);
+ memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN);
+ memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN);
rwa->or_byteorder = byteorder;
+ rwa->or_need_sync = ORNS_MAYBE;
+
return (0);
}
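/*
 * Illustrative restatement (not from this patch) of the or_need_sync
 * transitions added above, pulled into standalone helpers for clarity;
 * the real code inlines these in receive_object_range(),
 * receive_freeobjects() and receive_object().
 */
static inline void
or_range_begin_sketch(struct receive_writer_arg *rwa)
{
	/* New object range: we do not yet know whether a sync is needed. */
	rwa->or_need_sync = ORNS_MAYBE;
}

static inline void
or_freeobjects_sketch(struct receive_writer_arg *rwa)
{
	/* A real free in this range may turn an L1 block into a hole. */
	if (rwa->or_need_sync == ORNS_MAYBE)
		rwa->or_need_sync = ORNS_YES;
}

static inline void
or_before_new_object_sketch(struct receive_writer_arg *rwa)
{
	/* Let the pending hole conversion sync out before reallocating. */
	if (rwa->or_need_sync == ORNS_YES)
		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
	rwa->or_need_sync = ORNS_NO;
}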
@@ -2297,7 +2655,6 @@ receive_object_range(struct receive_writer_arg *rwa,
* Until we have the ability to redact large ranges of data efficiently, we
* process these records as frees.
*/
-/* ARGSUSED */
noinline static int
receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
{
@@ -2337,7 +2694,8 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
rrw_exit(&ds->ds_bp_rwlock, FTAG);
dsl_dataset_name(ds, name);
dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
- (void) dsl_destroy_head(name);
+ if (!drc->drc_heal)
+ (void) dsl_destroy_head(name);
}
}
@@ -2446,7 +2804,6 @@ receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
* numbers in the ignore list. In practice, we receive up to 32 object records
* before receiving write records, so the list can have up to 32 nodes in it.
*/
-/* ARGSUSED */
static void
receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
uint64_t length)
@@ -2699,7 +3056,19 @@ receive_process_record(struct receive_writer_arg *rwa,
ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
rwa->bytes_read = rrd->bytes_read;
- if (rrd->header.drr_type != DRR_WRITE) {
+ /* We can only heal write records; other ones get ignored */
+ if (rwa->heal && rrd->header.drr_type != DRR_WRITE) {
+ if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ return (0);
+ }
+
+ if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) {
err = flush_write_batch(rwa);
if (err != 0) {
if (rrd->abd != NULL) {
@@ -2734,9 +3103,16 @@ receive_process_record(struct receive_writer_arg *rwa,
case DRR_WRITE:
{
err = receive_process_write_record(rwa, rrd);
- if (err != EAGAIN) {
+ if (rwa->heal) {
+ /*
+			 * If healing, always free the abd after processing.
+ */
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ } else if (err != EAGAIN) {
/*
- * On success, receive_process_write_record() returns
+ * On success, a non-healing
+ * receive_process_write_record() returns
* EAGAIN to indicate that we do not want to free
* the rrd or arc_buf.
*/
@@ -2798,7 +3174,7 @@ receive_process_record(struct receive_writer_arg *rwa,
* dmu_recv_stream's worker thread; pull records off the queue, and then call
 * receive_process_record(). When we're done, signal the main thread and exit.
*/
-static void
+static __attribute__((noreturn)) void
receive_writer_thread(void *arg)
{
struct receive_writer_arg *rwa = arg;
@@ -2827,8 +3203,9 @@ receive_writer_thread(void *arg)
* EAGAIN indicates that this record has been saved (on
 * rwa->write_batch), and will be used again, so we don't
* free it.
+ * When healing data we always need to free the record.
*/
- if (err != EAGAIN) {
+ if (err != EAGAIN || rwa->heal) {
if (rwa->err == 0)
rwa->err = err;
kmem_free(rrd, sizeof (*rrd));
@@ -2836,10 +3213,13 @@ receive_writer_thread(void *arg)
}
kmem_free(rrd, sizeof (*rrd));
- int err = flush_write_batch(rwa);
- if (rwa->err == 0)
- rwa->err = err;
-
+ if (rwa->heal) {
+ zio_wait(rwa->heal_pio);
+ } else {
+ int err = flush_write_batch(rwa);
+ if (rwa->err == 0)
+ rwa->err = err;
+ }
mutex_enter(&rwa->mutex);
rwa->done = B_TRUE;
cv_signal(&rwa->cv);
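The healing branch above leans on the root ("godfather") zio pattern: dmu_recv_stream() creates rwa->heal_pio with ZIO_FLAG_GODFATHER, each repaired write record hangs a child I/O off it, and the writer thread only has to zio_wait() on that one parent once the queue drains. A minimal sketch of the pattern, assuming hypothetical helpers have_more_records(), next_record() and issue_repair_io() in place of the real record plumbing:

/* Sketch only: the helper names are placeholders, not functions in this file. */
static int
heal_all_records_sketch(spa_t *spa, struct receive_writer_arg *rwa)
{
	zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_GODFATHER);

	/* Each healed write record becomes one child I/O under 'pio'. */
	while (have_more_records(rwa))
		zio_nowait(issue_repair_io(pio, next_record(rwa)));

	/* One wait covers all children; their errors propagate to 'pio'. */
	return (zio_wait(pio));
}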
@@ -2923,17 +3303,19 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
if (err != 0)
goto out;
- /*
- * If this is a new dataset we set the key immediately.
- * Otherwise we don't want to change the key until we
- * are sure the rest of the receive succeeded so we stash
- * the keynvl away until then.
- */
- err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
- drc->drc_ds->ds_object, drc->drc_fromsnapobj,
- drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
- if (err != 0)
- goto out;
+ if (!drc->drc_heal) {
+ /*
+ * If this is a new dataset we set the key immediately.
+ * Otherwise we don't want to change the key until we
+ * are sure the rest of the receive succeeded so we
+ * stash the keynvl away until then.
+ */
+ err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
+ drc->drc_ds->ds_object, drc->drc_fromsnapobj,
+ drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
+ if (err != 0)
+ goto out;
+ }
/* see comment in dmu_recv_end_sync() */
drc->drc_ivset_guid = 0;
@@ -2951,6 +3333,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
}
/*
+ * For compatibility with recursive send streams, we do this here,
+ * rather than in dmu_recv_begin. If we pull the next header too
+ * early, and it's the END record, we break the `recv_skip` logic.
+ */
+ if (drc->drc_drr_begin->drr_payloadlen == 0) {
+ err = receive_read_payload_and_next_header(drc, 0, NULL);
+ if (err != 0)
+ goto out;
+ }
+
+ /*
* If we failed before this point we will clean up any new resume
* state that was created. Now that we've gotten past the initial
* checks we are ok to retain that resume state.
@@ -2964,11 +3357,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
rwa->os = drc->drc_os;
rwa->byteswap = drc->drc_byteswap;
+ rwa->heal = drc->drc_heal;
+ rwa->tofs = drc->drc_tofs;
rwa->resumable = drc->drc_resumable;
rwa->raw = drc->drc_raw;
rwa->spill = drc->drc_spill;
rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0);
rwa->os->os_raw_receive = drc->drc_raw;
+ if (drc->drc_heal) {
+ rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL,
+ ZIO_FLAG_GODFATHER);
+ }
list_create(&rwa->write_batch, sizeof (struct receive_record_arg),
offsetof(struct receive_record_arg, node.bqn_node));
@@ -2990,7 +3389,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
* stream, then we free drc->drc_rrd and exit.
*/
while (rwa->err == 0) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ if (issig()) {
err = SET_ERROR(EINTR);
break;
}
@@ -3104,7 +3503,9 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx)
ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
- if (!drc->drc_newfs) {
+ if (drc->drc_heal) {
+ error = 0;
+ } else if (!drc->drc_newfs) {
dsl_dataset_t *origin_head;
error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
@@ -3180,13 +3581,18 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dmu_recv_cookie_t *drc = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
- uint64_t newsnapobj;
+ uint64_t newsnapobj = 0;
spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
tx, "snap=%s", drc->drc_tosnap);
drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
- if (!drc->drc_newfs) {
+ if (drc->drc_heal) {
+ if (drc->drc_keynvl != NULL) {
+ nvlist_free(drc->drc_keynvl);
+ drc->drc_keynvl = NULL;
+ }
+ } else if (!drc->drc_newfs) {
dsl_dataset_t *origin_head;
VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
@@ -3300,7 +3706,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
* tunable is set, in which case we will leave the newly-generated
* value.
*/
- if (drc->drc_raw && drc->drc_ivset_guid != 0) {
+ if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) {
dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
DMU_OT_DSL_DATASET, tx);
VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
@@ -3367,7 +3773,7 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
if (error != 0) {
dmu_recv_cleanup_ds(drc);
nvlist_free(drc->drc_keynvl);
- } else {
+ } else if (!drc->drc_heal) {
if (drc->drc_newfs) {
zvol_create_minor(drc->drc_tofs);
}
@@ -3389,13 +3795,15 @@ dmu_objset_is_receiving(objset_t *os)
os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW,
"Maximum receive queue length");
-ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW,
"Receive queue fill fraction");
-ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW,
"Maximum amount of writes to batch into one transaction");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW,
+ "Ignore errors during corrective receive");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c
index fdbdf7d6e868..1feba0ba83de 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_redact.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -40,13 +40,14 @@
* This controls the number of entries in the buffer the redaction_list_update
* synctask uses to buffer writes to the redaction list.
*/
-int redact_sync_bufsize = 1024;
+static const int redact_sync_bufsize = 1024;
/*
* Controls how often to update the redaction list when creating a redaction
* list.
*/
-uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */
+static const uint64_t redaction_list_update_interval_ns =
+ 1000 * 1000 * 1000ULL; /* 1s */
/*
* This tunable controls the length of the queues that zfs redact worker threads
@@ -56,7 +57,7 @@ uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */
* available IO resources, or the queues are consuming too much memory, this
* variable may need to be decreased.
*/
-int zfs_redact_queue_length = 1024 * 1024;
+static const int zfs_redact_queue_length = 1024 * 1024;
/*
* These tunables control the fill fraction of the queues by zfs redact. The
@@ -65,7 +66,7 @@ int zfs_redact_queue_length = 1024 * 1024;
* should be tuned down. If the queues empty before the signalled thread can
* catch up, then these should be tuned up.
*/
-uint64_t zfs_redact_queue_ff = 20;
+static const uint64_t zfs_redact_queue_ff = 20;
struct redact_record {
bqueue_node_t ln;
@@ -141,7 +142,7 @@ record_merge_enqueue(bqueue_t *q, struct redact_record **build,
{
if (new->eos_marker) {
if (*build != NULL)
- bqueue_enqueue(q, *build, sizeof (*build));
+ bqueue_enqueue(q, *build, sizeof (**build));
bqueue_enqueue_flush(q, new, sizeof (*new));
return;
}
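The sizeof fix above is easy to gloss over: build is a struct redact_record **, so sizeof (*build) is only the size of a pointer, while sizeof (**build) is the size of the record whose memory the bqueue is actually accounting for. A stand-alone illustration of the pitfall (the struct below is a stand-in, not the real redact_record):

#include <stdio.h>

struct record_like {
	char payload[96];	/* stand-in for the real record fields */
};

int
main(void)
{
	struct record_like rec, *p = &rec, **build = &p;

	/* Size of a pointer (8 on LP64) -- what the old code charged. */
	printf("sizeof (*build)  = %zu\n", sizeof (*build));
	/* Size of the record itself -- what the queue accounting needs. */
	printf("sizeof (**build) = %zu\n", sizeof (**build));
	return (0);
}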
@@ -249,11 +250,11 @@ zfs_get_deleteq(objset_t *os)
* Third, if there is a deleted object, we need to create a redaction record for
* all of the blocks in that object.
*/
-/*ARGSUSED*/
static int
redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
+ (void) spa, (void) zilog;
struct redact_thread_arg *rta = arg;
struct redact_record *record;
@@ -350,7 +351,7 @@ redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (0);
}
-static void
+static __attribute__((noreturn)) void
redact_traverse_thread(void *arg)
{
struct redact_thread_arg *rt_arg = arg;
@@ -745,10 +746,8 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads,
bqueue_enqueue(q, record, sizeof (*record));
return (0);
}
- if (num_threads > 0) {
- redact_nodes = kmem_zalloc(num_threads *
- sizeof (*redact_nodes), KM_SLEEP);
- }
+ redact_nodes = vmem_zalloc(num_threads *
+ sizeof (*redact_nodes), KM_SLEEP);
avl_create(&start_tree, redact_node_compare_start,
sizeof (struct redact_node),
@@ -821,9 +820,9 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads,
avl_destroy(&start_tree);
avl_destroy(&end_tree);
- kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
+ vmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
if (current_record != NULL)
- bqueue_enqueue(q, current_record, sizeof (current_record));
+ bqueue_enqueue(q, current_record, sizeof (*current_record));
return (err);
}
@@ -836,7 +835,7 @@ struct redact_merge_thread_arg {
int error_code;
};
-static void
+static __attribute__((noreturn)) void
redact_merge_thread(void *arg)
{
struct redact_merge_thread_arg *rmta = arg;
@@ -854,7 +853,7 @@ redact_merge_thread(void *arg)
* object number.
*/
static int
-hold_next_object(objset_t *os, struct redact_record *rec, void *tag,
+hold_next_object(objset_t *os, struct redact_record *rec, const void *tag,
uint64_t *object, dnode_t **dn)
{
int err = 0;
@@ -913,7 +912,7 @@ perform_redaction(objset_t *os, redaction_list_t *rl,
object = prev_obj;
}
while (err == 0 && object <= rec->end_object) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ if (issig()) {
err = EINTR;
break;
}
@@ -1031,7 +1030,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
numsnaps = fnvlist_num_pairs(redactnvl);
if (numsnaps > 0)
- args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
+ args = vmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
nvpair_t *pair = NULL;
for (int i = 0; i < numsnaps; i++) {
@@ -1080,7 +1079,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
kmem_free(newredactbook,
sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
if (args != NULL)
- kmem_free(args, numsnaps * sizeof (*args));
+ vmem_free(args, numsnaps * sizeof (*args));
return (SET_ERROR(ENAMETOOLONG));
}
err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark);
@@ -1120,7 +1119,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
} else {
uint64_t *guids = NULL;
if (numsnaps > 0) {
- guids = kmem_zalloc(numsnaps * sizeof (uint64_t),
+ guids = vmem_zalloc(numsnaps * sizeof (uint64_t),
KM_SLEEP);
}
for (int i = 0; i < numsnaps; i++) {
@@ -1132,10 +1131,9 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
dp = NULL;
err = dsl_bookmark_create_redacted(newredactbook, snapname,
numsnaps, guids, FTAG, &new_rl);
- kmem_free(guids, numsnaps * sizeof (uint64_t));
- if (err != 0) {
+ vmem_free(guids, numsnaps * sizeof (uint64_t));
+ if (err != 0)
goto out;
- }
}
for (int i = 0; i < numsnaps; i++) {
@@ -1189,7 +1187,7 @@ out:
}
if (args != NULL)
- kmem_free(args, numsnaps * sizeof (*args));
+ vmem_free(args, numsnaps * sizeof (*args));
if (dp != NULL)
dsl_pool_rele(dp, FTAG);
if (ds != NULL) {
diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c
index 0658e13c2d25..cb2b62fed313 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_send.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_send.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -67,7 +67,7 @@
#endif
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
-int zfs_send_corrupt_data = B_FALSE;
+static int zfs_send_corrupt_data = B_FALSE;
/*
* This tunable controls the amount of data (measured in bytes) that will be
* prefetched by zfs send. If the main thread is blocking on reads that haven't
@@ -75,7 +75,7 @@ int zfs_send_corrupt_data = B_FALSE;
* thread is issuing new reads because the prefetches have fallen out of the
* cache, this may need to be decreased.
*/
-int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
+static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE;
/*
* This tunable controls the length of the queues that zfs send worker threads
* use to communicate. If the send_main_thread is blocking on these queues,
@@ -83,7 +83,7 @@ int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
* at the start of a send as these threads consume all the available IO
* resources, this variable may need to be decreased.
*/
-int zfs_send_no_prefetch_queue_length = 1024 * 1024;
+static uint_t zfs_send_no_prefetch_queue_length = 1024 * 1024;
/*
* These tunables control the fill fraction of the queues by zfs send. The fill
* fraction controls the frequency with which threads have to be cv_signaled.
@@ -91,19 +91,19 @@ int zfs_send_no_prefetch_queue_length = 1024 * 1024;
* down. If the queues empty before the signalled thread can catch up, then
* these should be tuned up.
*/
-int zfs_send_queue_ff = 20;
-int zfs_send_no_prefetch_queue_ff = 20;
+static uint_t zfs_send_queue_ff = 20;
+static uint_t zfs_send_no_prefetch_queue_ff = 20;
/*
* Use this to override the recordsize calculation for fast zfs send estimates.
*/
-int zfs_override_estimate_recordsize = 0;
+static uint_t zfs_override_estimate_recordsize = 0;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
-int zfs_send_set_freerecords_bit = B_TRUE;
+static const boolean_t zfs_send_set_freerecords_bit = B_TRUE;
/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
-int zfs_send_unmodified_spill_blocks = B_TRUE;
+static int zfs_send_unmodified_spill_blocks = B_TRUE;
static inline boolean_t
overflow_multiply(uint64_t a, uint64_t b, uint64_t *c)
@@ -165,6 +165,7 @@ struct send_range {
kmutex_t lock;
kcondvar_t cv;
boolean_t io_outstanding;
+ boolean_t io_compressed;
int io_err;
} data;
struct srh {
@@ -378,7 +379,7 @@ dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
}
}
/* create a FREE record and make it pending */
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_FREE;
drrf->drr_object = object;
drrf->drr_offset = offset;
@@ -437,7 +438,7 @@ dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
}
}
/* create a REDACT record and make it pending */
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_REDACT;
drrr->drr_object = object;
drrr->drr_offset = offset;
@@ -450,7 +451,8 @@ dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
static int
dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
- uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
+ uint64_t offset, int lsize, int psize, const blkptr_t *bp,
+ boolean_t io_compressed, void *data)
{
uint64_t payload_size;
boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
@@ -478,7 +480,7 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
dscp->dsc_pending_op = PENDING_NONE;
}
/* write a WRITE record */
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_WRITE;
drrw->drr_object = object;
drrw->drr_type = type;
@@ -487,7 +489,11 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
drrw->drr_logical_size = lsize;
/* only set the compression fields if the buf is compressed or raw */
- if (raw || lsize != psize) {
+ boolean_t compressed =
+ (bp != NULL ? BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
+ io_compressed : lsize != psize);
+ if (raw || compressed) {
+ ASSERT(bp != NULL);
ASSERT(raw || dscp->dsc_featureflags &
DMU_BACKUP_FEATURE_COMPRESSED);
ASSERT(!BP_IS_EMBEDDED(bp));
@@ -566,7 +572,7 @@ dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
ASSERT(BP_IS_EMBEDDED(bp));
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED;
drrw->drr_object = object;
drrw->drr_offset = offset;
@@ -579,7 +585,13 @@ dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
decode_embedded_bp_compressed(bp, buf);
- if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+ uint32_t psize = drrw->drr_psize;
+ uint32_t rsize = P2ROUNDUP(psize, 8);
+
+ if (psize != rsize)
+ memset(buf + psize, 0, rsize - psize);
+
+ if (dump_record(dscp, buf, rsize) != 0)
return (SET_ERROR(EINTR));
return (0);
}
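The rounding above matters because dump_record() emits payloads in 8-byte units; without the memset, the pad bytes between drr_psize and the rounded length would carry whatever was left in the buffer into the send stream. A small sketch of the same idiom, with the round-up macro written out inline rather than pulled from the ZFS headers:

#include <stdint.h>
#include <string.h>

/* Round x up to the next multiple of 8 (power-of-two round-up idiom). */
#define	ROUNDUP8_SKETCH(x)	(-(-(uint32_t)(x) & -(uint32_t)8))

static void
pad_payload(uint8_t *buf, uint32_t psize)
{
	uint32_t rsize = ROUNDUP8_SKETCH(psize);

	/* e.g. psize = 13 -> rsize = 16: zero the three trailing pad bytes. */
	if (psize != rsize)
		memset(buf + psize, 0, rsize - psize);
}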
@@ -599,7 +611,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
}
/* write a SPILL record */
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_SPILL;
drrs->drr_object = object;
drrs->drr_length = blksz;
@@ -607,7 +619,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
/* See comment in dump_dnode() for full details */
if (zfs_send_unmodified_spill_blocks &&
- (bp->blk_birth <= dscp->dsc_fromtxg)) {
+ (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
}
@@ -681,7 +693,7 @@ dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs)
}
/* write a FREEOBJECTS record */
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_FREEOBJECTS;
drrfo->drr_firstobj = firstobj;
drrfo->drr_numobjs = numobjs;
@@ -722,7 +734,7 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
}
/* write an OBJECT record */
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_OBJECT;
drro->drr_object = object;
drro->drr_type = dnp->dn_type;
@@ -758,6 +770,8 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
* to send it.
*/
if (bonuslen != 0) {
+ if (drro->drr_bonuslen > DN_MAX_BONUS_LEN(dnp))
+ return (SET_ERROR(EINVAL));
drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
bonuslen = drro->drr_raw_bonuslen;
}
@@ -790,11 +804,11 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
*/
if (zfs_send_unmodified_spill_blocks &&
(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
- (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) {
+ (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
struct send_range record;
blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
- bzero(&record, sizeof (struct send_range));
+ memset(&record, 0, sizeof (struct send_range));
record.type = DATA;
record.object = object;
record.eos_marker = B_FALSE;
@@ -834,7 +848,7 @@ dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp,
dscp->dsc_pending_op = PENDING_NONE;
}
- bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t));
dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE;
drror->drr_firstobj = firstobj;
drror->drr_numslots = numslots;
@@ -927,7 +941,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
if (BP_GET_TYPE(bp) == DMU_OT_SA) {
arc_flags_t aflags = ARC_FLAG_WAIT;
- enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+ zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
ASSERT(BP_IS_PROTECTED(bp));
@@ -1014,7 +1028,8 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
int n = MIN(srdp->datablksz,
SPA_OLD_MAXBLOCKSIZE);
err = dmu_dump_write(dscp, srdp->obj_type,
- range->object, offset, n, n, NULL, data);
+ range->object, offset, n, n, NULL, B_FALSE,
+ data);
offset += n;
/*
* When doing dry run, data==NULL is used as a
@@ -1028,7 +1043,8 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
} else {
err = dmu_dump_write(dscp, srdp->obj_type,
range->object, offset,
- srdp->datablksz, srdp->datasz, bp, data);
+ srdp->datablksz, srdp->datasz, bp,
+ srdp->io_compressed, data);
}
return (err);
}
@@ -1081,6 +1097,7 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL);
range->sru.data.io_outstanding = 0;
range->sru.data.io_err = 0;
+ range->sru.data.io_compressed = B_FALSE;
}
return (range);
}
@@ -1089,11 +1106,11 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
* This is the callback function to traverse_dataset that acts as a worker
* thread for dmu_send_impl.
*/
-/*ARGSUSED*/
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
+ (void) zilog;
struct send_thread_arg *sta = arg;
struct send_range *record;
@@ -1106,9 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
if (sta->os->os_encrypted &&
!BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
- spa_log_error(spa, zb);
- zfs_panic_recover("unencrypted block in encrypted "
- "object set %llu", dmu_objset_id(sta->os));
+ spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
return (SET_ERROR(EIO));
}
@@ -1126,7 +1141,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
record->sru.object.bp = *bp;
size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1);
record->sru.object.dnp = kmem_alloc(size, KM_SLEEP);
- bcopy(dnp, record->sru.object.dnp, size);
+ memcpy(record->sru.object.dnp, dnp, size);
bqueue_enqueue(&sta->q, record, sizeof (*record));
return (0);
}
@@ -1224,7 +1239,7 @@ redact_list_cb(redact_block_phys_t *rb, void *arg)
* error code of the thread in case something goes wrong, and pushes the End of
* Stream record when the traverse_dataset call has finished.
*/
-static void
+static __attribute__((noreturn)) void
send_traverse_thread(void *arg)
{
struct send_thread_arg *st_arg = arg;
@@ -1314,7 +1329,7 @@ get_next_range(bqueue_t *bq, struct send_range *prev)
return (next);
}
-static void
+static __attribute__((noreturn)) void
redact_list_thread(void *arg)
{
struct redact_list_thread_arg *rlt_arg = arg;
@@ -1509,7 +1524,7 @@ find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask)
* data from the redact_list_thread and use that to determine which blocks
* should be redacted.
*/
-static void
+static __attribute__((noreturn)) void
send_merge_thread(void *arg)
{
struct send_merge_thread_arg *smt_arg = arg;
@@ -1576,8 +1591,6 @@ send_merge_thread(void *arg)
}
range_free(front_ranges[i]);
}
- if (range == NULL)
- range = kmem_zalloc(sizeof (*range), KM_SLEEP);
range->eos_marker = B_TRUE;
bqueue_enqueue_flush(&smt_arg->q, range, 1);
spl_fstrans_unmark(cookie);
@@ -1644,12 +1657,15 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
!split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
!BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
- enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+ zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
- if (srta->featureflags & DMU_BACKUP_FEATURE_RAW)
+ if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) {
zioflags |= ZIO_FLAG_RAW;
- else if (request_compressed)
+ srdp->io_compressed = B_TRUE;
+ } else if (request_compressed) {
zioflags |= ZIO_FLAG_RAW_COMPRESS;
+ srdp->io_compressed = B_TRUE;
+ }
srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ?
BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp);
@@ -1701,8 +1717,10 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
struct send_range *range = range_alloc(range_type, dn->dn_object,
blkid, blkid + count, B_FALSE);
- if (blkid == DMU_SPILL_BLKID)
+ if (blkid == DMU_SPILL_BLKID) {
+ ASSERT3P(bp, !=, NULL);
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
+ }
switch (range_type) {
case HOLE:
@@ -1731,7 +1749,7 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
* some indirect blocks can be discarded because they're not holes. Second,
* it issues prefetches for the data we need to send.
*/
-static void
+static __attribute__((noreturn)) void
send_reader_thread(void *arg)
{
struct send_reader_thread_arg *srta = arg;
@@ -1823,8 +1841,7 @@ send_reader_thread(void *arg)
continue;
}
uint64_t file_max =
- (dn->dn_maxblkid < range->end_blkid ?
- dn->dn_maxblkid : range->end_blkid);
+ MIN(dn->dn_maxblkid, range->end_blkid);
/*
* The object exists, so we need to try to find the
* blkptr for each block in the range we're processing.
@@ -1900,7 +1917,7 @@ send_reader_thread(void *arg)
struct dmu_send_params {
/* Pool args */
- void *tag; // Tag that dp was held with, will be used to release dp.
+ const void *tag; // Tag dp was held with, will be used to release dp.
dsl_pool_t *dp;
/* To snapshot args */
const char *tosnap;
@@ -1936,7 +1953,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
{
dsl_dataset_t *to_ds = dspp->to_ds;
dsl_pool_t *dp = dspp->dp;
-#ifdef _KERNEL
+
if (dmu_objset_type(os) == DMU_OST_ZFS) {
uint64_t version;
if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
@@ -1945,7 +1962,6 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
if (version >= ZPL_VERSION_SA)
*featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
}
-#endif
/* raw sends imply large_block_ok */
if ((dspp->rawok || dspp->large_block_ok) &&
@@ -2144,6 +2160,7 @@ setup_resume_points(struct dmu_send_params *dspp,
struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os,
redaction_list_t *redact_rl, nvlist_t *nvl)
{
+ (void) smt_arg;
dsl_dataset_t *to_ds = dspp->to_ds;
int err = 0;
@@ -2348,7 +2365,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
dsl_dataset_t *to_ds = dspp->to_ds;
zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb;
dsl_pool_t *dp = dspp->dp;
- void *tag = dspp->tag;
+ const void *tag = dspp->tag;
err = dmu_objset_from_ds(to_ds, &os);
if (err != 0) {
@@ -2497,8 +2514,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
}
if (featureflags & DMU_BACKUP_FEATURE_RAW) {
- uint64_t ivset_guid = (ancestor_zb != NULL) ?
- ancestor_zb->zbm_ivset_guid : 0;
+ uint64_t ivset_guid = ancestor_zb->zbm_ivset_guid;
nvlist_t *keynvl = NULL;
ASSERT(os->os_encrypted);
@@ -2536,7 +2552,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
while (err == 0 && !range->eos_marker) {
err = do_dump(&dsc, range);
range = get_next_range(&srt_arg->q, range);
- if (issig(JUSTLOOKING) && issig(FORREAL))
+ if (issig())
err = SET_ERROR(EINTR);
}
@@ -2583,7 +2599,7 @@ dmu_send_impl(struct dmu_send_params *dspp)
* the receive side that the stream is incomplete.
*/
if (!dspp->savedok) {
- bzero(drr, sizeof (dmu_replay_record_t));
+ memset(drr, 0, sizeof (dmu_replay_record_t));
drr->drr_type = DRR_END;
drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc;
drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid;
@@ -2684,7 +2700,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
uint64_t size = dspp.numfromredactsnaps *
sizeof (uint64_t);
dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP);
- bcopy(fromredact, dspp.fromredactsnaps, size);
+ memcpy(dspp.fromredactsnaps, fromredact, size);
}
boolean_t is_before =
@@ -2702,6 +2718,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
err = dmu_send_impl(&dspp);
}
+ if (dspp.fromredactsnaps)
+ kmem_free(dspp.fromredactsnaps,
+ dspp.numfromredactsnaps * sizeof (uint64_t));
+
dsl_dataset_rele(dspp.to_ds, FTAG);
return (err);
}
@@ -2770,6 +2790,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
}
if (err == 0) {
+ owned = B_TRUE;
err = zap_lookup(dspp.dp->dp_meta_objset,
dspp.to_ds->ds_object,
DS_FIELD_RESUME_TOGUID, 8, 1,
@@ -2783,21 +2804,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
sizeof (dspp.saved_toname),
dspp.saved_toname);
}
- if (err != 0)
+ /* Only disown if there was an error in the lookups */
+ if (owned && (err != 0))
dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
kmem_strfree(name);
} else {
err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
FTAG, &dspp.to_ds);
+ if (err == 0)
+ owned = B_TRUE;
}
- owned = B_TRUE;
} else {
err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
&dspp.to_ds);
}
if (err != 0) {
+ /* Note: dsl dataset is not owned at this point */
dsl_pool_rele(dspp.dp, FTAG);
return (err);
}
@@ -2869,7 +2893,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
sizeof (uint64_t);
dspp.fromredactsnaps = kmem_zalloc(size,
KM_SLEEP);
- bcopy(fromredact, dspp.fromredactsnaps,
+ memcpy(dspp.fromredactsnaps, fromredact,
size);
}
if (!dsl_dataset_is_before(dspp.to_ds, fromds,
@@ -2910,6 +2934,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
/* dmu_send_impl will call dsl_pool_rele for us. */
err = dmu_send_impl(&dspp);
} else {
+ if (dspp.fromredactsnaps)
+ kmem_free(dspp.fromredactsnaps,
+ dspp.numfromredactsnaps *
+ sizeof (uint64_t));
dsl_pool_rele(dspp.dp, FTAG);
}
} else {
@@ -3002,7 +3030,7 @@ dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds,
dsl_dataset_name(origds, dsname);
(void) strcat(dsname, "/");
- (void) strcat(dsname, recv_clone_name);
+ (void) strlcat(dsname, recv_clone_name, sizeof (dsname));
err = dsl_dataset_hold(origds->ds_dir->dd_pool,
dsname, FTAG, &ds);
@@ -3072,25 +3100,23 @@ out:
return (err);
}
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW,
"Allow sending corrupt data");
-ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, UINT, ZMOD_RW,
"Maximum send queue length");
ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW,
"Send unmodified spill blocks");
-ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, UINT, ZMOD_RW,
"Maximum send queue length for non-prefetch queues");
-ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, UINT, ZMOD_RW,
"Send queue fill fraction");
-ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, UINT, ZMOD_RW,
"Send queue fill fraction for non-prefetch queues");
-ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, UINT, ZMOD_RW,
"Override block size estimate with fixed size");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
index 862c0bf404ad..15cc2885e805 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_traverse.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -39,9 +39,9 @@
#include <sys/callb.h>
#include <sys/zfeature.h>
-int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
-int32_t send_holes_without_birth_time = 1;
-int32_t zfs_traverse_indirect_prefetch_limit = 32;
+static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
+static int32_t send_holes_without_birth_time = 1;
+static uint_t zfs_traverse_indirect_prefetch_limit = 32;
typedef struct prefetch_data {
kmutex_t pd_mtx;
@@ -83,7 +83,8 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
if (BP_IS_HOLE(bp))
return (0);
- if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+ if (claim_txg == 0 &&
+ BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
return (-1);
SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -108,9 +109,10 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
if (BP_IS_HOLE(bp))
return (0);
- if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
return (0);
+ ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
@@ -153,10 +155,10 @@ typedef enum resume_skip {
* Otherwise returns RESUME_SKIP_NONE.
*/
static resume_skip_t
-resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp,
const zbookmark_phys_t *zb)
{
- if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+ if (td->td_resume != NULL) {
/*
* If we already visited this bp & everything below,
* don't bother doing it again.
@@ -164,12 +166,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
return (RESUME_SKIP_ALL);
- /*
- * If we found the block we're trying to resume from, zero
- * the bookmark out to indicate that we have resumed.
- */
- if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
- bzero(td->td_resume, sizeof (*zb));
+ if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
if (td->td_flags & TRAVERSE_POST)
return (RESUME_SKIP_CHILDREN);
}
@@ -181,22 +178,22 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
* Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE.
*/
static boolean_t
-traverse_prefetch_metadata(traverse_data_t *td,
+traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
- arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
return (B_FALSE);
/*
- * If we are in the process of resuming, don't prefetch, because
- * some children will not be needed (and in fact may have already
- * been freed).
+ * If this bp is before the resume point, it may have already been
+ * freed.
*/
- if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
+ if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
return (B_FALSE);
- if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+ if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
return (B_FALSE);
if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
return (B_FALSE);
@@ -239,7 +236,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
ASSERT(0);
}
- if (bp->blk_birth == 0) {
+ if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
/*
* Since this block has a birth time of 0 it must be one of
* two things: a hole created before the
@@ -267,7 +264,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
zb->zb_object == DMU_META_DNODE_OBJECT) &&
td->td_hole_birth_enabled_txg <= td->td_min_txg)
return (0);
- } else if (bp->blk_birth <= td->td_min_txg) {
+ } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
return (0);
}
@@ -342,7 +339,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
SET_BOOKMARK(czb, zb->zb_objset,
zb->zb_object, zb->zb_level - 1,
zb->zb_blkid * epb + pidx);
- if (traverse_prefetch_metadata(td,
+ if (traverse_prefetch_metadata(td, dnp,
&((blkptr_t *)buf->b_data)[pidx],
czb) == B_TRUE) {
prefetched++;
@@ -504,12 +501,12 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
- traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
+ traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb);
}
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
+ traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
}
}
@@ -560,11 +557,11 @@ traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
return (err);
}
-/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
+ (void) zilog, (void) dnp;
prefetch_data_t *pfd = arg;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
@@ -670,7 +667,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
/* See comment on ZIL traversal in dsl_scan_visitds. */
if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
- enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
uint32_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;
@@ -809,11 +806,10 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
"Max number of bytes to prefetch");
-ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW,
"Traverse prefetch number of blocks pointed by indirect block");
#if defined(_KERNEL)
@@ -822,6 +818,6 @@ MODULE_PARM_DESC(ignore_hole_birth,
"Alias for send_holes_without_birth_time");
#endif
+/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
"Ignore hole_birth txg for zfs send");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c
index 5fa516866668..8451b5082e86 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_tx.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = {
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
- { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
+ { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 },
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
};
@@ -210,16 +210,22 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold_level(dn, level, blkid, FTAG);
+ err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock);
- if (db == NULL)
- return (SET_ERROR(EIO));
- err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ if (err == ENOENT)
+ return (0);
+ if (err != 0)
+ return (err);
+ /*
+ * PARTIAL_FIRST allows caching for uncacheable blocks. It will
+ * be cleared after dmu_buf_will_dirty() call dbuf_read() again.
+ */
+ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH |
+ (level == 0 ? DB_RF_PARTIAL_FIRST : 0));
dbuf_rele(db, FTAG);
return (err);
}
-/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
@@ -291,6 +297,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
}
static void
+dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dnode_t *dn = txh->txh_dnode;
+ int err = 0;
+
+ if (len == 0)
+ return;
+
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
+
+ if (dn == NULL)
+ return;
+
+ /*
+ * For i/o error checking, read the blocks that will be needed
+ * to perform the append: the first level-0 block, but only if the
+ * write is not block-aligned (i.e. a partial-block write); no
+ * additional blocks are read.
+ */
+ if (dn->dn_maxblkid == 0) {
+ if (off < dn->dn_datablksz &&
+ (off > 0 || len < dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ } else {
+ zio_t *zio = zio_root(dn->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ uint64_t start = off >> dn->dn_datablkshift;
+ if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+}
+
+static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
(void) zfs_refcount_add_many(&txh->txh_space_towrite,
@@ -331,6 +384,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
}
/*
+ * Should be used when appending to an object and the exact offset is unknown.
+ * The write must occur at or beyond the specified offset. Only the L0 block
+ * at provided offset will be prefetched.
+ */
+void
+dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_APPEND, off, DMU_OBJECT_END);
+ if (txh != NULL) {
+ dmu_tx_count_append(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+void
+dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END);
+ if (txh != NULL) {
+ dmu_tx_count_append(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
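A hedged usage sketch of the new hold type: a caller appending len bytes at (or beyond) offset off of object obj in objset os declares the append before assigning the transaction, much like the existing write holds. Signatures follow the in-tree DMU API as used by other callers; error handling and the caller's own locking are trimmed, so treat this as a sketch rather than a reference implementation:

static int
append_sketch(objset_t *os, uint64_t obj, uint64_t off, int len,
    const void *buf)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* Declare the append; only the L0 block at 'off' is prefetched. */
	dmu_tx_hold_append(tx, obj, off, len);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, obj, off, len, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}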
+
+/*
* This function marks the transaction as being a "net free". The end
* result is that refquotas will be disabled for this transaction, and
* this transaction will be able to use half of the pool space overhead
@@ -345,7 +434,7 @@ dmu_tx_mark_netfree(dmu_tx_t *tx)
}
static void
-dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
dmu_tx_t *tx = txh->txh_tx;
dnode_t *dn = txh->txh_dnode;
@@ -353,15 +442,11 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
ASSERT(tx->tx_txg == 0);
- dmu_tx_count_dnode(txh);
-
if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
- dmu_tx_count_dnode(txh);
-
/*
* For i/o error checking, we read the first and last level-0
* blocks if they are not aligned, and all the level-1 blocks.
@@ -441,8 +526,10 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
object, THT_FREE, off, len);
- if (txh != NULL)
- (void) dmu_tx_hold_free_impl(txh, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_free(txh, off, len);
+ }
}
void
@@ -451,8 +538,35 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
dmu_tx_hold_t *txh;
txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
- if (txh != NULL)
- (void) dmu_tx_hold_free_impl(txh, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_free(txh, off, len);
+ }
+}
+
+static void
+dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+
+ /*
+ * Reuse dmu_tx_count_free(), it does exactly what we need for clone.
+ */
+ dmu_tx_count_free(txh, off, len);
+}
+
+void
+dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_dnode(txh);
+ dmu_tx_count_clone(txh, off, len);
+ }
}
static void
@@ -461,6 +575,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
dmu_tx_t *tx = txh->txh_tx;
dnode_t *dn = txh->txh_dnode;
int err;
+ extern int zap_micro_max_size;
ASSERT(tx->tx_txg == 0);
@@ -476,7 +591,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
* - 2 grown ptrtbl blocks
*/
(void) zfs_refcount_add_many(&txh->txh_space_towrite,
- MZAP_MAX_BLKSZ, FTAG);
+ zap_micro_max_size, FTAG);
if (dn == NULL)
return;
@@ -638,6 +753,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
if (blkid == 0)
match_offset = TRUE;
break;
+ case THT_APPEND:
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
+ match_offset = TRUE;
+
+ /*
+ * THT_WRITE is used for bonus and spill blocks.
+ */
+ ASSERT(blkid != DMU_BONUS_BLKID &&
+ blkid != DMU_SPILL_BLKID);
+
+ /*
+ * They might have to increase nlevels,
+ * thus dirtying the new TLIBs. Or they
+ * might have to change the block size,
+ * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
case THT_FREE:
/*
* We will dirty all the level 1 blocks in
@@ -662,6 +797,10 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
case THT_NEWOBJECT:
match_object = TRUE;
break;
+ case THT_CLONE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ break;
default:
cmn_err(CE_PANIC, "bad txh_type %d",
txh->txh_type);
@@ -683,8 +822,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
* If we can't do 10 iops, something is wrong. Let us go ahead
* and hit zfs_dirty_data_max.
*/
-hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
-int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
/*
* We delay transactions when we've determined that the backend storage
@@ -781,34 +919,49 @@ static void
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
{
dsl_pool_t *dp = tx->tx_pool;
- uint64_t delay_min_bytes =
+ uint64_t delay_min_bytes, wrlog;
+ hrtime_t wakeup, tx_time = 0, now;
+
+ /* Calculate minimum transaction time for the dirty data amount. */
+ delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
- hrtime_t wakeup, min_tx_time, now;
+ if (dirty > delay_min_bytes) {
+ /*
+ * The caller has already waited until we are under the max.
+ * We make them pass us the amount of dirty data so we don't
+ * have to handle the case of it being >= the max, which
+ * could cause a divide-by-zero if it's == the max.
+ */
+ ASSERT3U(dirty, <, zfs_dirty_data_max);
- if (dirty <= delay_min_bytes)
- return;
+ tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
+ (zfs_dirty_data_max - dirty);
+ }
- /*
- * The caller has already waited until we are under the max.
- * We make them pass us the amount of dirty data so we don't
- * have to handle the case of it being >= the max, which could
- * cause a divide-by-zero if it's == the max.
- */
- ASSERT3U(dirty, <, zfs_dirty_data_max);
+ /* Calculate minimum transaction time for the TX_WRITE log size. */
+ wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
+ delay_min_bytes =
+ zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
+ if (wrlog >= zfs_wrlog_data_max) {
+ tx_time = zfs_delay_max_ns;
+ } else if (wrlog > delay_min_bytes) {
+ tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
+ (zfs_wrlog_data_max - wrlog), tx_time);
+ }
+
+ if (tx_time == 0)
+ return;
+ tx_time = MIN(tx_time, zfs_delay_max_ns);
now = gethrtime();
- min_tx_time = zfs_delay_scale *
- (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
- min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
- if (now > tx->tx_start + min_tx_time)
+ if (now > tx->tx_start + tx_time)
return;
DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
- uint64_t, min_tx_time);
+ uint64_t, tx_time);
mutex_enter(&dp->dp_lock);
- wakeup = MAX(tx->tx_start + min_tx_time,
- dp->dp_last_wakeup + min_tx_time);
+ wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
dp->dp_last_wakeup = wakeup;
mutex_exit(&dp->dp_lock);
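To make the scaling concrete, here is a worked example of the dirty-data term using illustrative values (zfs_dirty_data_max = 4 GiB, zfs_delay_min_dirty_percent = 60, zfs_delay_scale = 500000 ns; plausible defaults, not taken from this diff). The new wrlog term has the same shape with zfs_wrlog_data_max in place of zfs_dirty_data_max, and the larger of the two delays wins:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t dirty_max = 4ULL << 30;		/* 4 GiB */
	uint64_t delay_min = dirty_max * 60 / 100;	/* ~2.4 GiB threshold */
	uint64_t scale = 500000;			/* ns */
	uint64_t dirty = 3ULL << 30;			/* 3 GiB dirty */

	/* Same shape as dmu_tx_delay(): grows without bound as dirty -> max. */
	uint64_t tx_time = scale * (dirty - delay_min) / (dirty_max - dirty);
	printf("delay = %llu ns\n", (unsigned long long)tx_time);  /* ~300000 */
	return (0);
}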
@@ -886,8 +1039,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
}
if (!tx->tx_dirty_delayed &&
- dsl_pool_wrlog_over_max(tx->tx_pool)) {
- DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
+ dsl_pool_need_wrlog_delay(tx->tx_pool)) {
+ tx->tx_wait_dirty = B_TRUE;
+ DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
return (SET_ERROR(ERESTART));
}
@@ -1244,8 +1398,7 @@ dmu_tx_do_callbacks(list_t *cb_list, int error)
{
dmu_tx_callback_t *dcb;
- while ((dcb = list_tail(cb_list)) != NULL) {
- list_remove(cb_list, dcb);
+ while ((dcb = list_remove_tail(cb_list)) != NULL) {
dcb->dcb_func(dcb->dcb_data, error);
kmem_free(dcb, sizeof (dmu_tx_callback_t));
}
@@ -1405,6 +1558,8 @@ dmu_tx_fini(void)
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_append);
+EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
index 043344a1375f..ed50f1889b59 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -28,6 +28,7 @@
*/
#include <sys/zfs_context.h>
+#include <sys/arc_impl.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
@@ -43,38 +44,63 @@
* so it can't hurt performance.
*/
-int zfs_prefetch_disable = B_FALSE;
+static int zfs_prefetch_disable = B_FALSE;
/* max # of streams per zfetch */
-unsigned int zfetch_max_streams = 8;
+static unsigned int zfetch_max_streams = 8;
/* min time before stream reclaim */
-unsigned int zfetch_min_sec_reap = 2;
+static unsigned int zfetch_min_sec_reap = 1;
+/* max time before stream delete */
+static unsigned int zfetch_max_sec_reap = 2;
+#ifdef _ILP32
+/* min bytes to prefetch per stream (default 2MB) */
+static unsigned int zfetch_min_distance = 2 * 1024 * 1024;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_max_distance = 8 * 1024 * 1024;
+#else
+/* min bytes to prefetch per stream (default 4MB) */
+static unsigned int zfetch_min_distance = 4 * 1024 * 1024;
+/* max bytes to prefetch per stream (default 64MB) */
+unsigned int zfetch_max_distance = 64 * 1024 * 1024;
+#endif
/* max bytes to prefetch indirects for per stream (default 64MB) */
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
-/* max number of bytes in an array_read in which we allow prefetching (1MB) */
-unsigned long zfetch_array_rd_sz = 1024 * 1024;
+/* max request reorder distance within a stream (default 16MB) */
+unsigned int zfetch_max_reorder = 16 * 1024 * 1024;
+/* Max log2 fraction of holes in a stream */
+unsigned int zfetch_hole_shift = 2;
typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
+ kstat_named_t zfetchstat_future;
+ kstat_named_t zfetchstat_stride;
+ kstat_named_t zfetchstat_past;
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_max_streams;
kstat_named_t zfetchstat_io_issued;
+ kstat_named_t zfetchstat_io_active;
} zfetch_stats_t;
static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
+ { "future", KSTAT_DATA_UINT64 },
+ { "stride", KSTAT_DATA_UINT64 },
+ { "past", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "max_streams", KSTAT_DATA_UINT64 },
- { "io_issued", KSTAT_DATA_UINT64 },
+ { "io_issued", KSTAT_DATA_UINT64 },
+ { "io_active", KSTAT_DATA_UINT64 },
};
struct {
wmsum_t zfetchstat_hits;
+ wmsum_t zfetchstat_future;
+ wmsum_t zfetchstat_stride;
+ wmsum_t zfetchstat_past;
wmsum_t zfetchstat_misses;
wmsum_t zfetchstat_max_streams;
wmsum_t zfetchstat_io_issued;
+ aggsum_t zfetchstat_io_active;
} zfetch_sums;
#define ZFETCHSTAT_BUMP(stat) \
@@ -83,7 +109,7 @@ struct {
wmsum_add(&zfetch_sums.stat, val)
-kstat_t *zfetch_ksp;
+static kstat_t *zfetch_ksp;
static int
zfetch_kstats_update(kstat_t *ksp, int rw)
@@ -94,12 +120,20 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
return (EACCES);
zs->zfetchstat_hits.value.ui64 =
wmsum_value(&zfetch_sums.zfetchstat_hits);
+ zs->zfetchstat_future.value.ui64 =
+ wmsum_value(&zfetch_sums.zfetchstat_future);
+ zs->zfetchstat_stride.value.ui64 =
+ wmsum_value(&zfetch_sums.zfetchstat_stride);
+ zs->zfetchstat_past.value.ui64 =
+ wmsum_value(&zfetch_sums.zfetchstat_past);
zs->zfetchstat_misses.value.ui64 =
wmsum_value(&zfetch_sums.zfetchstat_misses);
zs->zfetchstat_max_streams.value.ui64 =
wmsum_value(&zfetch_sums.zfetchstat_max_streams);
zs->zfetchstat_io_issued.value.ui64 =
wmsum_value(&zfetch_sums.zfetchstat_io_issued);
+ zs->zfetchstat_io_active.value.ui64 =
+ aggsum_value(&zfetch_sums.zfetchstat_io_active);
return (0);
}
@@ -107,9 +141,13 @@ void
zfetch_init(void)
{
wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
+ wmsum_init(&zfetch_sums.zfetchstat_future, 0);
+ wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
+ wmsum_init(&zfetch_sums.zfetchstat_past, 0);
wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
+ aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);
zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
@@ -131,9 +169,14 @@ zfetch_fini(void)
}
wmsum_fini(&zfetch_sums.zfetchstat_hits);
+ wmsum_fini(&zfetch_sums.zfetchstat_future);
+ wmsum_fini(&zfetch_sums.zfetchstat_stride);
+ wmsum_fini(&zfetch_sums.zfetchstat_past);
wmsum_fini(&zfetch_sums.zfetchstat_misses);
wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
+ ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
+ aggsum_fini(&zfetch_sums.zfetchstat_io_active);
}
/*
@@ -195,75 +238,219 @@ dmu_zfetch_fini(zfetch_t *zf)
}
/*
- * If there aren't too many streams already, create a new stream.
+ * If there aren't too many active streams already, create one more.
+ * In the process, delete or reuse all streams that have had no hits for
+ * zfetch_max_sec_reap. If needed, reuse the oldest stream that has had no
+ * hits for zfetch_min_sec_reap seconds, or that has never had a hit at all.
* The "blkid" argument is the next block that we expect this stream to access.
- * While we're here, clean up old streams (which haven't been
- * accessed for at least zfetch_min_sec_reap seconds).
*/
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
- zstream_t *zs_next;
- hrtime_t now = gethrtime();
+ zstream_t *zs, *zs_next, *zs_old = NULL;
+ uint_t now = gethrestime_sec(), t;
ASSERT(MUTEX_HELD(&zf->zf_lock));
/*
- * Clean up old streams.
+ * Delete too old streams, reusing the first found one.
*/
- for (zstream_t *zs = list_head(&zf->zf_stream);
- zs != NULL; zs = zs_next) {
+ t = now - zfetch_max_sec_reap;
+ for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
zs_next = list_next(&zf->zf_stream, zs);
/*
* Skip if still active. 1 -- zf_stream reference.
*/
+ if ((int)(zs->zs_atime - t) >= 0)
+ continue;
if (zfs_refcount_count(&zs->zs_refs) != 1)
continue;
- if (((now - zs->zs_atime) / NANOSEC) >
- zfetch_min_sec_reap)
+ if (zs_old)
dmu_zfetch_stream_remove(zf, zs);
+ else
+ zs_old = zs;
+ }
+ if (zs_old) {
+ zs = zs_old;
+ list_remove(&zf->zf_stream, zs);
+ goto reuse;
}
/*
* The maximum number of streams is normally zfetch_max_streams,
* but for small files we lower it such that it's at least possible
* for all the streams to be non-overlapping.
- *
- * If we are already at the maximum number of streams for this file,
- * even after removing old streams, then don't create this stream.
*/
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
- zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+ (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
zfetch_max_distance));
if (zf->zf_numstreams >= max_streams) {
+ t = now - zfetch_min_sec_reap;
+ for (zs = list_head(&zf->zf_stream); zs != NULL;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if ((int)(zs->zs_atime - t) >= 0)
+ continue;
+ if (zfs_refcount_count(&zs->zs_refs) != 1)
+ continue;
+ if (zs_old == NULL ||
+ (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
+ zs_old = zs;
+ }
+ if (zs_old) {
+ zs = zs_old;
+ list_remove(&zf->zf_stream, zs);
+ goto reuse;
+ }
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
return;
}
- zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
- zs->zs_blkid = blkid;
- zs->zs_pf_blkid1 = blkid;
- zs->zs_pf_blkid = blkid;
- zs->zs_ipf_blkid1 = blkid;
- zs->zs_ipf_blkid = blkid;
- zs->zs_atime = now;
- zs->zs_fetch = zf;
- zs->zs_missed = B_FALSE;
+ zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
zfs_refcount_create(&zs->zs_callers);
zfs_refcount_create(&zs->zs_refs);
/* One reference for zf_stream. */
zfs_refcount_add(&zs->zs_refs, NULL);
zf->zf_numstreams++;
+
+reuse:
list_insert_head(&zf->zf_stream, zs);
+ zs->zs_blkid = blkid;
+ /* Allow immediate stream reuse until first hit. */
+ zs->zs_atime = now - zfetch_min_sec_reap;
+ memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
+ zs->zs_pf_dist = 0;
+ zs->zs_ipf_dist = 0;
+ zs->zs_pf_start = blkid;
+ zs->zs_pf_end = blkid;
+ zs->zs_ipf_start = blkid;
+ zs->zs_ipf_end = blkid;
+ zs->zs_missed = B_FALSE;
+ zs->zs_more = B_FALSE;
}
static void
-dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
+dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
{
zstream_t *zs = arg;
+ if (io_issued && level == 0 && blkid < zs->zs_blkid)
+ zs->zs_more = B_TRUE;
if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
dmu_zfetch_stream_fini(zs);
+ aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
+}
+
+/*
+ * Process a stream hit access of nblks blocks starting at zs_blkid. Return
+ * the number of blocks to proceed with after aggregation with future ranges.
+ */
+static uint64_t
+dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
+{
+ uint_t i, j;
+
+ /* Optimize sequential accesses (no future ranges). */
+ if (zs->zs_ranges[0].start == 0)
+ goto done;
+
+ /* Look for intersections with further ranges. */
+ for (i = 0; i < ZFETCH_RANGES; i++) {
+ zsrange_t *r = &zs->zs_ranges[i];
+ if (r->start == 0 || r->start > nblks)
+ break;
+ if (r->end >= nblks) {
+ nblks = r->end;
+ i++;
+ break;
+ }
+ }
+
+ /* Delete all intersecting ranges found, updating the remaining ones. */
+ for (j = 0; i < ZFETCH_RANGES; i++, j++) {
+ if (zs->zs_ranges[i].start == 0)
+ break;
+ ASSERT3U(zs->zs_ranges[i].start, >, nblks);
+ ASSERT3U(zs->zs_ranges[i].end, >, nblks);
+ zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
+ zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
+ }
+ if (j < ZFETCH_RANGES) {
+ zs->zs_ranges[j].start = 0;
+ zs->zs_ranges[j].end = 0;
+ }
+
+done:
+ zs->zs_blkid += nblks;
+ return (nblks);
+}
+
+/*
+ * Process a future stream access of nblks blocks starting at blkid. Return
+ * the number of blocks to proceed with if the future ranges reach the fill
+ * threshold.
+ */
+static uint64_t
+dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
+{
+ ASSERT3U(blkid, >, zs->zs_blkid);
+ blkid -= zs->zs_blkid;
+ ASSERT3U(blkid + nblks, <=, UINT16_MAX);
+
+ /* Search for first and last intersection or insert point. */
+ uint_t f = ZFETCH_RANGES, l = 0, i;
+ for (i = 0; i < ZFETCH_RANGES; i++) {
+ zsrange_t *r = &zs->zs_ranges[i];
+ if (r->start == 0 || r->start > blkid + nblks)
+ break;
+ if (r->end < blkid)
+ continue;
+ if (f > i)
+ f = i;
+ if (l < i)
+ l = i;
+ }
+ if (f <= l) {
+ /* Got some intersecting range, expand it if needed. */
+ if (zs->zs_ranges[f].start > blkid)
+ zs->zs_ranges[f].start = blkid;
+ zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
+ if (f < l) {
+ /* Got more than one intersection, remove others. */
+ for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
+ zs->zs_ranges[f].start = zs->zs_ranges[l].start;
+ zs->zs_ranges[f].end = zs->zs_ranges[l].end;
+ }
+ zs->zs_ranges[f].start = 0;
+ zs->zs_ranges[f].end = 0;
+ }
+ } else if (i < ZFETCH_RANGES) {
+ /* Got no intersecting ranges, insert new one. */
+ for (l = ZFETCH_RANGES - 1; l > i; l--) {
+ zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
+ zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
+ }
+ zs->zs_ranges[i].start = blkid;
+ zs->zs_ranges[i].end = blkid + nblks;
+ } else {
+ /* No space left to insert. Drop the range. */
+ return (0);
+ }
+
+ /* Check whether this new access brings us to the fill threshold. */
+ if (zfetch_hole_shift >= 16)
+ return (0);
+ uint_t hole = 0;
+ for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
+ zsrange_t *r = &zs->zs_ranges[i];
+ if (r->start == 0)
+ break;
+ hole += r->start - f;
+ f = r->end;
+ if (hole <= r->end >> zfetch_hole_shift)
+ l = r->end;
+ }
+ if (l > 0)
+ return (dmu_zfetch_hit(zs, l));
+
+ return (0);
}
/*
@@ -283,15 +470,15 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
boolean_t fetch_data, boolean_t have_lock)
{
zstream_t *zs;
- int64_t pf_start, ipf_start;
- int64_t pf_ahead_blks, max_blks;
- int max_dist_blks, pf_nblks, ipf_nblks;
- uint64_t end_of_access_blkid, maxblkid;
- end_of_access_blkid = blkid + nblks;
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
+ zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch;
- if (zfs_prefetch_disable)
+ if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE)
return (NULL);
+
+ if (os_prefetch == ZFS_PREFETCH_METADATA)
+ fetch_data = B_FALSE;
+
/*
* If we haven't yet loaded the indirect vdevs' mappings, we
* can only read from blocks that we carefully ensure are on
@@ -316,7 +503,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
* A fast path for small files for which no prefetch will
* happen.
*/
- maxblkid = zf->zf_dnode->dn_maxblkid;
+ uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
if (maxblkid < 2) {
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
@@ -325,115 +512,147 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
mutex_enter(&zf->zf_lock);
/*
- * Find matching prefetch stream. Depending on whether the accesses
+ * Find perfect prefetch stream. Depending on whether the accesses
* are block-aligned, first block of the new access may either follow
* the last block of the previous access, or be equal to it.
*/
+ unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+ uint64_t end_blkid = blkid + nblks;
for (zs = list_head(&zf->zf_stream); zs != NULL;
zs = list_next(&zf->zf_stream, zs)) {
if (blkid == zs->zs_blkid) {
- break;
+ goto hit;
} else if (blkid + 1 == zs->zs_blkid) {
blkid++;
nblks--;
- break;
+ goto hit;
}
}
/*
- * If the file is ending, remove the matching stream if found.
- * If not found then it is too late to create a new one now.
+ * Find a close enough prefetch stream. An access crossing the stream
+ * position is a hit in its new part. An access ahead of the stream
+ * position is considered a hit for metadata prefetch, since we do not
+ * care about the fill percent there, or is stored as a future range
+ * otherwise. An access behind the stream position is silently ignored,
+ * since we already skipped it when reaching the fill percent.
*/
- if (end_of_access_blkid >= maxblkid) {
- if (zs != NULL)
- dmu_zfetch_stream_remove(zf, zs);
- mutex_exit(&zf->zf_lock);
- if (!have_lock)
- rw_exit(&zf->zf_dnode->dn_struct_rwlock);
- return (NULL);
+ uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
+ uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
+ for (zs = list_head(&zf->zf_stream); zs != NULL;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (blkid > zs->zs_blkid) {
+ if (end_blkid <= zs->zs_blkid + max_reorder) {
+ if (!fetch_data) {
+ nblks = dmu_zfetch_hit(zs,
+ end_blkid - zs->zs_blkid);
+ ZFETCHSTAT_BUMP(zfetchstat_stride);
+ goto future;
+ }
+ nblks = dmu_zfetch_future(zs, blkid, nblks);
+ if (nblks > 0)
+ ZFETCHSTAT_BUMP(zfetchstat_stride);
+ else
+ ZFETCHSTAT_BUMP(zfetchstat_future);
+ goto future;
+ }
+ } else if (end_blkid >= zs->zs_blkid) {
+ nblks -= zs->zs_blkid - blkid;
+ blkid += zs->zs_blkid - blkid;
+ goto hit;
+ } else if (end_blkid + max_reorder > zs->zs_blkid &&
+ (int)(zs->zs_atime - t) >= 0) {
+ ZFETCHSTAT_BUMP(zfetchstat_past);
+ zs->zs_atime = gethrestime_sec();
+ goto out;
+ }
}
- /* Exit if we already prefetched this block before. */
- if (nblks == 0) {
- mutex_exit(&zf->zf_lock);
- if (!have_lock)
- rw_exit(&zf->zf_dnode->dn_struct_rwlock);
- return (NULL);
- }
+ /*
+ * This access is not part of any existing stream. Create a new
+ * stream for it unless we are at the end of file.
+ */
+ if (end_blkid < maxblkid)
+ dmu_zfetch_stream_create(zf, end_blkid);
+ mutex_exit(&zf->zf_lock);
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ ZFETCHSTAT_BUMP(zfetchstat_misses);
+ return (NULL);
- if (zs == NULL) {
- /*
- * This access is not part of any existing stream. Create
- * a new stream for it.
- */
- dmu_zfetch_stream_create(zf, end_of_access_blkid);
+hit:
+ nblks = dmu_zfetch_hit(zs, nblks);
+ ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+future:
+ zs->zs_atime = gethrestime_sec();
+
+ /* Exit if we already prefetched for this position before. */
+ if (nblks == 0)
+ goto out;
+
+ /* If the file is ending, remove the stream. */
+ end_blkid = zs->zs_blkid;
+ if (end_blkid >= maxblkid) {
+ dmu_zfetch_stream_remove(zf, zs);
+out:
mutex_exit(&zf->zf_lock);
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
- ZFETCHSTAT_BUMP(zfetchstat_misses);
return (NULL);
}
/*
* This access was to a block that we issued a prefetch for on
- * behalf of this stream. Issue further prefetches for this stream.
+ * behalf of this stream. Calculate further prefetch distances.
*
- * Normally, we start prefetching where we stopped
- * prefetching last (zs_pf_blkid). But when we get our first
- * hit on this stream, zs_pf_blkid == zs_blkid, we don't
- * want to prefetch the block we just accessed. In this case,
- * start just after the block we just accessed.
- */
- pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
- if (zs->zs_pf_blkid1 < end_of_access_blkid)
- zs->zs_pf_blkid1 = end_of_access_blkid;
- if (zs->zs_ipf_blkid1 < end_of_access_blkid)
- zs->zs_ipf_blkid1 = end_of_access_blkid;
-
- /*
- * Double our amount of prefetched data, but don't let the
- * prefetch get further ahead than zfetch_max_distance.
+ * Start the prefetch from the demand access size (nblks). Double the
+ * distance on every access up to zfetch_min_distance. After that,
+ * increase the distance by 1/8 only if needed, up to zfetch_max_distance.
+ *
+ * Don't double the distance beyond a single block if more than ~6% of
+ * the ARC is held by active prefetches. That should help us avoid
+ * running out of RAM on some badly mispredicted read patterns.
*/
+ unsigned int nbytes = nblks << dbs;
+ unsigned int pf_nblks;
if (fetch_data) {
- max_dist_blks =
- zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
- /*
- * Previously, we were (zs_pf_blkid - blkid) ahead. We
- * want to now be double that, so read that amount again,
- * plus the amount we are catching up by (i.e. the amount
- * read just now).
- */
- pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
- max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
- pf_nblks = MIN(pf_ahead_blks, max_blks);
+ if (unlikely(zs->zs_pf_dist < nbytes))
+ zs->zs_pf_dist = nbytes;
+ else if (zs->zs_pf_dist < zfetch_min_distance &&
+ (zs->zs_pf_dist < (1 << dbs) ||
+ aggsum_compare(&zfetch_sums.zfetchstat_io_active,
+ arc_c_max >> (4 + dbs)) < 0))
+ zs->zs_pf_dist *= 2;
+ else if (zs->zs_more)
+ zs->zs_pf_dist += zs->zs_pf_dist / 8;
+ zs->zs_more = B_FALSE;
+ if (zs->zs_pf_dist > zfetch_max_distance)
+ zs->zs_pf_dist = zfetch_max_distance;
+ pf_nblks = zs->zs_pf_dist >> dbs;
} else {
pf_nblks = 0;
}
-
- zs->zs_pf_blkid = pf_start + pf_nblks;
+ if (zs->zs_pf_start < end_blkid)
+ zs->zs_pf_start = end_blkid;
+ if (zs->zs_pf_end < end_blkid + pf_nblks)
+ zs->zs_pf_end = end_blkid + pf_nblks;
/*
- * Do the same for indirects, starting from where we stopped last,
- * or where we will stop reading data blocks (and the indirects
- * that point to them).
+ * Do the same for indirects, starting where we will stop reading
+ * data blocks (and the indirects that point to them).
*/
- ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
- max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
- /*
- * We want to double our distance ahead of the data prefetch
- * (or reader, if we are not prefetching data). Previously, we
- * were (zs_ipf_blkid - blkid) ahead. To double that, we read
- * that amount again, plus the amount we are catching up by
- * (i.e. the amount read now + the amount of data prefetched now).
- */
- pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
- max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid);
- ipf_nblks = MIN(pf_ahead_blks, max_blks);
- zs->zs_ipf_blkid = ipf_start + ipf_nblks;
-
- zs->zs_blkid = end_of_access_blkid;
- /* Protect the stream from reclamation. */
- zs->zs_atime = gethrtime();
+ if (unlikely(zs->zs_ipf_dist < nbytes))
+ zs->zs_ipf_dist = nbytes;
+ else
+ zs->zs_ipf_dist *= 2;
+ if (zs->zs_ipf_dist > zfetch_max_idistance)
+ zs->zs_ipf_dist = zfetch_max_idistance;
+ pf_nblks = zs->zs_ipf_dist >> dbs;
+ if (zs->zs_ipf_start < zs->zs_pf_end)
+ zs->zs_ipf_start = zs->zs_pf_end;
+ if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
+ zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;
+
zfs_refcount_add(&zs->zs_refs, NULL);
/* Count concurrent callers. */
zfs_refcount_add(&zs->zs_callers, NULL);
@@ -441,15 +660,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-
- ZFETCHSTAT_BUMP(zfetchstat_hits);
return (zs);
}
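
For intuition about the distance logic above, the sketch below models how zs_pf_dist would grow for a purely sequential stream of 128K accesses. Illustrative only: the 4 MB / 64 MB values stand in for the zfetch_min_distance / zfetch_max_distance tunables (check the module parameters for the real defaults), the ARC back-pressure check is omitted, and the zs_more condition (a demand read catching an outstanding prefetch) is assumed to hold on every access.

#include <stdio.h>

int
main(void)
{
	const unsigned min_dist = 4u << 20;	/* assumed min distance */
	const unsigned max_dist = 64u << 20;	/* assumed max distance */
	const unsigned nbytes = 128u << 10;	/* demand access size */
	unsigned dist = 0;

	for (int hit = 1; hit <= 12; hit++) {
		if (dist < nbytes)
			dist = nbytes;		/* start from demand size */
		else if (dist < min_dist)
			dist *= 2;		/* exponential phase */
		else
			dist += dist / 8;	/* additive phase (zs_more) */
		if (dist > max_dist)
			dist = max_dist;
		printf("hit %2d: prefetch distance %u KB\n", hit, dist >> 10);
	}
	return (0);
}
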
void
-dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
+ boolean_t have_lock)
{
- zfetch_t *zf = zs->zs_fetch;
int64_t pf_start, pf_end, ipf_start, ipf_end;
int epbs, issued;
@@ -470,13 +687,13 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
mutex_enter(&zf->zf_lock);
if (zs->zs_missed) {
- pf_start = zs->zs_pf_blkid1;
- pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
+ pf_start = zs->zs_pf_start;
+ pf_end = zs->zs_pf_start = zs->zs_pf_end;
} else {
pf_start = pf_end = 0;
}
- ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
- ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
+ ipf_start = zs->zs_ipf_start;
+ ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
mutex_exit(&zf->zf_lock);
ASSERT3S(pf_start, <=, pf_end);
ASSERT3S(ipf_start, <=, ipf_end);
@@ -488,14 +705,14 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
issued = pf_end - pf_start + ipf_end - ipf_start;
if (issued > 1) {
/* More references on top of taken in dmu_zfetch_prepare(). */
- for (int i = 0; i < issued - 1; i++)
- zfs_refcount_add(&zs->zs_refs, NULL);
+ zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
} else if (issued == 0) {
/* Some other thread has done our work, so drop the ref. */
if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
dmu_zfetch_stream_fini(zs);
return;
}
+ aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);
if (!have_lock)
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
@@ -503,13 +720,11 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
issued = 0;
for (int64_t blk = pf_start; blk < pf_end; blk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
- dmu_zfetch_stream_done, zs);
+ ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
}
for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
- dmu_zfetch_stream_done, zs);
+ ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
}
if (!have_lock)
@@ -527,10 +742,9 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
if (zs)
- dmu_zfetch_run(zs, missed, have_lock);
+ dmu_zfetch_run(zf, zs, missed, have_lock);
}
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
"Disable all ZFS prefetching");
@@ -540,12 +754,20 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
"Min time before stream reclaim");
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
+ "Max time before stream delete");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
+ "Min bytes to prefetch per stream");
+
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
"Max bytes to prefetch per stream");
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
"Max bytes to prefetch indirects for per stream");
-ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
- "Number of bytes in a array_read");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
+ "Max request reorder distance within a stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
+ "Max log2 fraction of holes in a stream");
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index db1a5d71df3c..a703fd414f87 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -71,6 +71,8 @@ dnode_stats_t dnode_stats = {
{ "dnode_move_active", KSTAT_DATA_UINT64 },
};
+dnode_sums_t dnode_sums;
+
static kstat_t *dnode_ksp;
static kmem_cache_t *dnode_cache;
@@ -97,6 +99,14 @@ dbuf_compare(const void *x1, const void *x2)
if (likely(cmp))
return (cmp);
+ if (d1->db_state == DB_MARKER) {
+ ASSERT3S(d2->db_state, !=, DB_MARKER);
+ return (TREE_PCMP(d1->db_parent, d2));
+ } else if (d2->db_state == DB_MARKER) {
+ ASSERT3S(d1->db_state, !=, DB_MARKER);
+ return (TREE_PCMP(d1, d2->db_parent));
+ }
+
if (d1->db_state == DB_SEARCH) {
ASSERT3S(d2->db_state, !=, DB_SEARCH);
return (-1);
@@ -108,12 +118,11 @@ dbuf_compare(const void *x1, const void *x2)
return (TREE_PCMP(d1, d2));
}
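
The DB_MARKER handling above pairs with dnode_evict_dbufs() further down in this diff, which stashes (uintptr_t)db - 1 in the marker's db_parent so that, among dbufs with equal level/blkid, the marker sorts immediately before the dbuf being evicted. A self-contained sketch of the same comparator pattern, using local stand-in names rather than the dbuf structures:

#include <assert.h>
#include <stdint.h>

enum node_state { NODE_NORMAL, NODE_MARKER };

typedef struct node {
	uint64_t	key;		/* stands in for level/blkid */
	enum node_state	state;
	const void	*shadow;	/* markers: element pointer - 1 */
} node_t;

/* Three-way pointer compare, in the spirit of TREE_PCMP(). */
static int
pcmp(const void *a, const void *b)
{
	uintptr_t pa = (uintptr_t)a, pb = (uintptr_t)b;

	return (pa < pb ? -1 : (pa > pb ? 1 : 0));
}

static int
node_compare(const void *x1, const void *x2)
{
	const node_t *n1 = x1, *n2 = x2;

	if (n1->key < n2->key)
		return (-1);
	if (n1->key > n2->key)
		return (1);

	/* Equal keys: a marker compares through the pointer it shadows. */
	if (n1->state == NODE_MARKER)
		return (pcmp(n1->shadow, n2));
	if (n2->state == NODE_MARKER)
		return (pcmp(n1, n2->shadow));
	return (pcmp(n1, n2));
}

int
main(void)
{
	node_t elems[2] = {{ .key = 7 }, { .key = 7 }};
	node_t marker = {
		.key = 7,
		.state = NODE_MARKER,
		.shadow = (const void *)((uintptr_t)&elems[1] - 1),
	};

	/* The marker sorts between elems[0] and the element it shadows. */
	assert(node_compare(&elems[0], &marker) < 0);
	assert(node_compare(&marker, &elems[1]) < 0);
	return (0);
}
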
-/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
dnode_t *dn = arg;
- int i;
rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -129,17 +138,17 @@ dnode_cons(void *arg, void *unused, int kmflag)
zfs_refcount_create(&dn->dn_tx_holds);
list_link_init(&dn->dn_link);
- bzero(&dn->dn_next_type[0], sizeof (dn->dn_next_type));
- bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
- bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
- bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
- bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
- bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
- bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
- bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
- bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));
+ memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type));
+ memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr));
+ memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels));
+ memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift));
+ memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype));
+ memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk));
+ memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen));
+ memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz));
+ memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid));
- for (i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++) {
multilist_link_init(&dn->dn_dirty_link[i]);
dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
@@ -174,11 +183,10 @@ dnode_cons(void *arg, void *unused, int kmflag)
return (0);
}
-/* ARGSUSED */
static void
dnode_dest(void *arg, void *unused)
{
- int i;
+ (void) unused;
dnode_t *dn = arg;
rw_destroy(&dn->dn_struct_rwlock);
@@ -190,7 +198,7 @@ dnode_dest(void *arg, void *unused)
zfs_refcount_destroy(&dn->dn_tx_holds);
ASSERT(!list_link_active(&dn->dn_link));
- for (i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++) {
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
@@ -227,6 +235,72 @@ dnode_dest(void *arg, void *unused)
avl_destroy(&dn->dn_dbufs);
}
+static int
+dnode_kstats_update(kstat_t *ksp, int rw)
+{
+ dnode_stats_t *ds = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+ ds->dnode_hold_dbuf_hold.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_dbuf_hold);
+ ds->dnode_hold_dbuf_read.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_dbuf_read);
+ ds->dnode_hold_alloc_hits.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_hits);
+ ds->dnode_hold_alloc_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_misses);
+ ds->dnode_hold_alloc_interior.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_interior);
+ ds->dnode_hold_alloc_lock_retry.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry);
+ ds->dnode_hold_alloc_lock_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses);
+ ds->dnode_hold_alloc_type_none.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_type_none);
+ ds->dnode_hold_free_hits.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_hits);
+ ds->dnode_hold_free_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_misses);
+ ds->dnode_hold_free_lock_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_lock_misses);
+ ds->dnode_hold_free_lock_retry.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_lock_retry);
+ ds->dnode_hold_free_refcount.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_refcount);
+ ds->dnode_hold_free_overflow.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_overflow);
+ ds->dnode_free_interior_lock_retry.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_free_interior_lock_retry);
+ ds->dnode_allocate.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_allocate);
+ ds->dnode_reallocate.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_reallocate);
+ ds->dnode_buf_evict.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_buf_evict);
+ ds->dnode_alloc_next_chunk.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_alloc_next_chunk);
+ ds->dnode_alloc_race.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_alloc_race);
+ ds->dnode_alloc_next_block.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_alloc_next_block);
+ ds->dnode_move_invalid.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_invalid);
+ ds->dnode_move_recheck1.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_recheck1);
+ ds->dnode_move_recheck2.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_recheck2);
+ ds->dnode_move_special.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_special);
+ ds->dnode_move_handle.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_handle);
+ ds->dnode_move_rwlock.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_rwlock);
+ ds->dnode_move_active.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_active);
+ return (0);
+}
+
void
dnode_init(void)
{
@@ -235,11 +309,41 @@ dnode_init(void)
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
kmem_cache_set_move(dnode_cache, dnode_move);
+ wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
+ wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_hits, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0);
+ wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0);
+ wmsum_init(&dnode_sums.dnode_allocate, 0);
+ wmsum_init(&dnode_sums.dnode_reallocate, 0);
+ wmsum_init(&dnode_sums.dnode_buf_evict, 0);
+ wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0);
+ wmsum_init(&dnode_sums.dnode_alloc_race, 0);
+ wmsum_init(&dnode_sums.dnode_alloc_next_block, 0);
+ wmsum_init(&dnode_sums.dnode_move_invalid, 0);
+ wmsum_init(&dnode_sums.dnode_move_recheck1, 0);
+ wmsum_init(&dnode_sums.dnode_move_recheck2, 0);
+ wmsum_init(&dnode_sums.dnode_move_special, 0);
+ wmsum_init(&dnode_sums.dnode_move_handle, 0);
+ wmsum_init(&dnode_sums.dnode_move_rwlock, 0);
+ wmsum_init(&dnode_sums.dnode_move_active, 0);
+
dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (dnode_ksp != NULL) {
dnode_ksp->ks_data = &dnode_stats;
+ dnode_ksp->ks_update = dnode_kstats_update;
kstat_install(dnode_ksp);
}
}
@@ -252,6 +356,35 @@ dnode_fini(void)
dnode_ksp = NULL;
}
+ wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold);
+ wmsum_fini(&dnode_sums.dnode_hold_dbuf_read);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_hits);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_interior);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none);
+ wmsum_fini(&dnode_sums.dnode_hold_free_hits);
+ wmsum_fini(&dnode_sums.dnode_hold_free_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry);
+ wmsum_fini(&dnode_sums.dnode_hold_free_refcount);
+ wmsum_fini(&dnode_sums.dnode_hold_free_overflow);
+ wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry);
+ wmsum_fini(&dnode_sums.dnode_allocate);
+ wmsum_fini(&dnode_sums.dnode_reallocate);
+ wmsum_fini(&dnode_sums.dnode_buf_evict);
+ wmsum_fini(&dnode_sums.dnode_alloc_next_chunk);
+ wmsum_fini(&dnode_sums.dnode_alloc_race);
+ wmsum_fini(&dnode_sums.dnode_alloc_next_block);
+ wmsum_fini(&dnode_sums.dnode_move_invalid);
+ wmsum_fini(&dnode_sums.dnode_move_recheck1);
+ wmsum_fini(&dnode_sums.dnode_move_recheck2);
+ wmsum_fini(&dnode_sums.dnode_move_special);
+ wmsum_fini(&dnode_sums.dnode_move_handle);
+ wmsum_fini(&dnode_sums.dnode_move_rwlock);
+ wmsum_fini(&dnode_sums.dnode_move_active);
+
kmem_cache_destroy(dnode_cache);
dnode_cache = NULL;
}
@@ -319,7 +452,7 @@ dnode_byteswap(dnode_phys_t *dnp)
int i;
if (dnp->dn_type == DMU_OT_NONE) {
- bzero(dnp, sizeof (dnode_phys_t));
+ memset(dnp, 0, sizeof (dnode_phys_t));
return;
}
@@ -344,20 +477,11 @@ dnode_byteswap(dnode_phys_t *dnp)
* dnode dnode is smaller than a regular dnode.
*/
if (dnp->dn_bonuslen != 0) {
- /*
- * Note that the bonus length calculated here may be
- * longer than the actual bonus buffer. This is because
- * we always put the bonus buffer after the last block
- * pointer (instead of packing it against the end of the
- * dnode buffer).
- */
- int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- int slots = dnp->dn_extra_slots + 1;
- size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
dmu_object_byteswap_t byteswap;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
- dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
+ dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
+ DN_MAX_BONUS_LEN(dnp));
}
/* Swap SPILL block if we have one */
@@ -397,7 +521,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
/* clear any data after the end of the new size */
size_t diff = dn->dn_bonuslen - newsize;
char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
- bzero(data_end, diff);
+ memset(data_end, 0, diff);
}
dn->dn_bonuslen = newsize;
@@ -598,12 +722,13 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
DNODE_STAT_BUMP(dnode_allocate);
ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+ ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)));
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
ASSERT(ot != DMU_OT_NONE);
ASSERT(DMU_OT_IS_VALID(ot));
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0) ||
+ (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
@@ -751,8 +876,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
{
- int i;
-
ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
@@ -776,29 +899,29 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ndn->dn_datablksz = odn->dn_datablksz;
ndn->dn_maxblkid = odn->dn_maxblkid;
ndn->dn_num_slots = odn->dn_num_slots;
- bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
+ memcpy(ndn->dn_next_type, odn->dn_next_type,
sizeof (odn->dn_next_type));
- bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr,
sizeof (odn->dn_next_nblkptr));
- bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels,
sizeof (odn->dn_next_nlevels));
- bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift,
sizeof (odn->dn_next_indblkshift));
- bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype,
sizeof (odn->dn_next_bonustype));
- bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk,
sizeof (odn->dn_rm_spillblk));
- bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen,
sizeof (odn->dn_next_bonuslen));
- bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ memcpy(ndn->dn_next_blksz, odn->dn_next_blksz,
sizeof (odn->dn_next_blksz));
- bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
+ memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid,
sizeof (odn->dn_next_maxblkid));
- for (i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++) {
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
}
- bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ memcpy(ndn->dn_free_ranges, odn->dn_free_ranges,
sizeof (odn->dn_free_ranges));
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
@@ -852,7 +975,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
/*
* Satisfy the destructor.
*/
- for (i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++) {
list_create(&odn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
@@ -889,7 +1012,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
odn->dn_moved = (uint8_t)-1;
}
-/*ARGSUSED*/
static kmem_cbrc_t
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
{
@@ -1123,9 +1245,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
return (B_TRUE);
}
-static void
+static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
+ uint_t reclaimed = 0;
+
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
for (int i = idx; i < idx + slots; i++) {
@@ -1137,8 +1261,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
dnode_destroy(dnh->dnh_dnode);
dnh->dnh_dnode = DN_SLOT_FREE;
+ reclaimed++;
}
}
+
+ return (reclaimed);
}
void
@@ -1156,7 +1283,7 @@ dnode_free_interior_slots(dnode_t *dn)
while (!dnode_slots_tryenter(children, idx, slots)) {
DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
}
dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
@@ -1273,7 +1400,7 @@ dnode_buf_evict_async(void *dbu)
*/
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
- void *tag, dnode_t **dnp)
+ const void *tag, dnode_t **dnp)
{
int epb, idx, err;
int drop_struct_lock = FALSE;
@@ -1437,7 +1564,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
dnode_slots_rele(dnc, idx, slots);
while (!dnode_slots_tryenter(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
}
/*
@@ -1451,6 +1578,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
+ dmu_buf_add_user_size(&db->db,
+ sizeof (dnode_t));
}
}
@@ -1492,7 +1621,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
dnode_slots_rele(dnc, idx, slots);
while (!dnode_slots_tryenter(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
}
if (!dnode_check_slots_free(dnc, idx, slots)) {
@@ -1508,8 +1637,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* to be freed. Single slot dnodes can be safely
* re-purposed as a performance optimization.
*/
- if (slots > 1)
- dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+ if (slots > 1) {
+ uint_t reclaimed =
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+ if (reclaimed > 0)
+ dmu_buf_sub_user_size(&db->db,
+ reclaimed * sizeof (dnode_t));
+ }
dnh = &dnc->dnc_children[idx];
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
@@ -1517,6 +1651,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
+ dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
}
mutex_enter(&dn->dn_mtx);
@@ -1567,7 +1702,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* Return held dnode if the object is allocated, NULL if not.
*/
int
-dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
+dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)
{
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
dnp));
@@ -1579,7 +1714,7 @@ dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
* new reference.
*/
boolean_t
-dnode_add_ref(dnode_t *dn, void *tag)
+dnode_add_ref(dnode_t *dn, const void *tag)
{
mutex_enter(&dn->dn_mtx);
if (zfs_refcount_is_zero(&dn->dn_holds)) {
@@ -1592,14 +1727,14 @@ dnode_add_ref(dnode_t *dn, void *tag)
}
void
-dnode_rele(dnode_t *dn, void *tag)
+dnode_rele(dnode_t *dn, const void *tag)
{
mutex_enter(&dn->dn_mtx);
dnode_rele_and_unlock(dn, tag, B_FALSE);
}
void
-dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
+dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting)
{
uint64_t refs;
/* Get while the hold prevents the dnode from moving. */
@@ -1621,7 +1756,9 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
* other direct or indirect hold on the dnode must first drop the dnode
* handle.
*/
+#ifdef ZFS_DEBUG
ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+#endif
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && db != NULL) {
@@ -1649,7 +1786,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
}
/*
- * Checks if the dnode contains any uncommitted dirty records.
+ * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
+ * It is important to check both conditions, as some operations (eg appending
+ * to a file) can dirty both as a single logical unit, but they are not synced
+ * out atomically, so checking one and not the other can result in an object
+ * appearing to be clean mid-way through a commit.
+ *
+ * Do not change this lightly! If you get it wrong, dmu_offset_next() can
+ * detect a hole where there is really data, leading to silent corruption.
*/
boolean_t
dnode_is_dirty(dnode_t *dn)
@@ -1657,7 +1801,8 @@ dnode_is_dirty(dnode_t *dn)
mutex_enter(&dn->dn_mtx);
for (int i = 0; i < TXG_SIZE; i++) {
- if (multilist_link_active(&dn->dn_dirty_link[i])) {
+ if (multilist_link_active(&dn->dn_dirty_link[i]) ||
+ !list_is_empty(&dn->dn_dirty_records[i])) {
mutex_exit(&dn->dn_mtx);
return (B_TRUE);
}
@@ -1767,7 +1912,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs == dn->dn_indblkshift)
ibs = 0;
- if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ if (size == dn->dn_datablksz && ibs == 0)
return (0);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
@@ -1790,24 +1935,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs && dn->dn_nlevels != 1)
goto fail;
- /* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
- if (err == 0) {
- dbuf_new_size(db, size, tx);
- } else if (err != ENOENT) {
- goto fail;
- }
-
- dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (size != dn->dn_datablksz) {
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+ if (err == 0) {
+ dbuf_new_size(db, size, tx);
+ } else if (err != ENOENT) {
+ goto fail;
+ }
+
+ dnode_setdblksz(dn, size);
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
+ if (db)
+ dbuf_rele(db, FTAG);
+ }
if (ibs) {
dn->dn_indblkshift = ibs;
- dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
}
- /* release after we have fixed the blocksize in the dnode */
- if (db)
- dbuf_rele(db, FTAG);
rw_exit(&dn->dn_struct_rwlock);
return (0);
@@ -2032,7 +2178,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
}
void
-dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag)
+dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag)
{
/*
* Don't set dirtyctx to SYNC if we're just modifying this as we
@@ -2082,7 +2228,7 @@ dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len,
dmu_buf_will_dirty(&db->db, tx);
data = db->db.db_data;
- bzero(data + blkoff, len);
+ memset(data + blkoff, 0, len);
}
dbuf_rele(db, FTAG);
}
@@ -2292,19 +2438,11 @@ dnode_spill_freed(dnode_t *dn)
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
- void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
if (blkid == DMU_BONUS_BLKID)
return (FALSE);
- /*
- * If we're in the process of opening the pool, dp will not be
- * set yet, but there shouldn't be anything dirty.
- */
- if (dp == NULL)
- return (FALSE);
-
if (dn->dn_free_txg)
return (TRUE);
@@ -2419,7 +2557,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
}
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
- db->db_blkptr->blk_birth <= txg ||
+ BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
BP_IS_HOLE(db->db_blkptr))) {
/*
* This can only happen when we are searching up the tree
@@ -2467,7 +2605,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
i >= 0 && i < epb; i += inc) {
if (BP_GET_FILL(&bp[i]) >= minfill &&
BP_GET_FILL(&bp[i]) <= maxfill &&
- (hole || bp[i].blk_birth > txg))
+ (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
break;
if (inc > 0 || *offset > 0)
*offset += inc;
@@ -2481,8 +2619,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
if (inc < 0) {
/* traversing backwards; position offset at the end */
- ASSERT3U(*offset, <=, start);
- *offset = MIN(*offset + (1ULL << span) - 1, start);
+ if (span < 8 * sizeof (*offset))
+ *offset = MIN(*offset + (1ULL << span) - 1,
+ start);
} else if (*offset < start) {
*offset = start;
}
@@ -2589,3 +2728,8 @@ EXPORT_SYMBOL(dnode_free_range);
EXPORT_SYMBOL(dnode_evict_dbufs);
EXPORT_SYMBOL(dnode_evict_bonus);
#endif
+
+ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW,
+ "Default dnode block shift");
+ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW,
+ "Default dnode indirect block shift");
diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c
index dd37e3af7ed5..f67dad002319 100644
--- a/sys/contrib/openzfs/module/zfs/dnode_sync.c
+++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -70,8 +70,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
for (i = 0; i < nblkptr; i++) {
- children[i] =
- dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+ children[i] = dbuf_find(dn->dn_objset, dn->dn_object,
+ old_toplvl, i, NULL);
}
/* transfer dnode's block pointers to new indirect block */
@@ -82,7 +82,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
ASSERT(db->db.db_data);
ASSERT(arc_released(db->db_buf));
ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
- bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ memcpy(db->db.db_data, dn->dn_phys->dn_blkptr,
sizeof (blkptr_t) * nblkptr);
arc_buf_freeze(db->db_buf);
@@ -119,7 +119,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&child->db_mtx);
}
- bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+ memset(dn->dn_phys->dn_blkptr, 0, sizeof (blkptr_t) * nblkptr);
rw_exit(&db->db_rwlock);
if (dn->dn_dbuf != NULL)
@@ -158,7 +158,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
dmu_object_type_t type = BP_GET_TYPE(bp);
uint64_t lvl = BP_GET_LEVEL(bp);
- bzero(bp, sizeof (blkptr_t));
+ memset(bp, 0, sizeof (blkptr_t));
if (spa_feature_is_active(dn->dn_objset->os_spa,
SPA_FEATURE_HOLE_BIRTH)) {
@@ -175,19 +175,21 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
static void
free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
- int off, num;
- int i, err, epbs;
+ uint64_t off, num, i, j;
+ unsigned int epbs;
+ int err;
uint64_t txg = tx->tx_txg;
dnode_t *dn;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- off = start - (db->db_blkid * 1<<epbs);
+ off = start - (db->db_blkid << epbs);
num = end - start + 1;
- ASSERT3U(off, >=, 0);
- ASSERT3U(num, >=, 0);
+ ASSERT3U(dn->dn_phys->dn_indblkshift, >=, SPA_BLKPTRSHIFT);
+ ASSERT3U(end + 1, >=, start);
+ ASSERT3U(start, >=, (db->db_blkid << epbs));
ASSERT3U(db->db_level, >, 0);
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
@@ -197,7 +199,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
uint64_t *buf;
dmu_buf_impl_t *child;
dbuf_dirty_record_t *dr;
- int j;
ASSERT(db->db_level == 1);
@@ -217,8 +218,11 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
for (j = 0; j < child->db.db_size >> 3; j++) {
if (buf[j] != 0) {
panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- (void *)child, i, off, num);
+ "child=%p i=%llu off=%llu "
+ "num=%llu\n",
+ (void *)child, (u_longlong_t)i,
+ (u_longlong_t)off,
+ (u_longlong_t)num);
}
}
}
@@ -234,8 +238,11 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
for (j = 0; j < child->db.db_size >> 3; j++) {
if (buf[j] != 0) {
panic("freed data not zero: "
- "child=%p i=%d off=%d num=%d\n",
- (void *)child, i, off, num);
+ "child=%p i=%llu off=%llu "
+ "num=%llu\n",
+ (void *)child, (u_longlong_t)i,
+ (u_longlong_t)off,
+ (u_longlong_t)num);
}
}
}
@@ -347,7 +354,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
rw_enter(&db->db_rwlock, RW_WRITER);
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
ASSERT(BP_IS_HOLE(bp));
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
free_blocks(dn, db->db_blkptr, 1, tx);
rw_exit(&db->db_rwlock);
}
@@ -475,7 +482,14 @@ dnode_evict_dbufs(dnode_t *dn)
zfs_refcount_is_zero(&db->db_holds)) {
db_marker->db_level = db->db_level;
db_marker->db_blkid = db->db_blkid;
- db_marker->db_state = DB_SEARCH;
+ /*
+ * Insert a MARKER node with the same level and blkid.
+ * To resolve any ties in dbuf_compare(), use the
+ * pointer of the dbuf we are evicting, passed via
+ * db_parent.
+ */
+ db_marker->db_state = DB_MARKER;
+ db_marker->db_parent = (void *)((uintptr_t)db - 1);
avl_insert_here(&dn->dn_dbufs, db_marker, db,
AVL_BEFORE);
@@ -597,7 +611,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_free_txg > 0);
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
- bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ memset(dn->dn_phys, 0, sizeof (dnode_phys_t) * dn->dn_num_slots);
dnode_free_interior_slots(dn);
mutex_enter(&dn->dn_mtx);
@@ -620,6 +634,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
/*
* Write out the dnode's dirty buffers.
+ * Does not wait for zio completions.
*/
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
@@ -634,7 +649,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
ASSERT(dnp->dn_type != DMU_OT_NONE ||
- bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
+ memcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
DNODE_VERIFY(dn);
ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
@@ -655,8 +670,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
DNODE_FLAG_USEROBJUSED_ACCOUNTED;
mutex_exit(&dn->dn_mtx);
dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
- } else {
- /* Once we account for it, we should always account for it */
+ } else if (!(os->os_encrypted && dmu_objset_is_receiving(os))) {
+ /*
+ * Once we account for it, we should always account for it,
+ * except for the case of a raw receive. We will not be able
+ * to account for it until the receiving dataset has been
+ * mounted.
+ */
ASSERT(!(dn->dn_phys->dn_flags &
DNODE_FLAG_USERUSED_ACCOUNTED));
ASSERT(!(dn->dn_phys->dn_flags &
@@ -822,7 +842,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_allocated_txg == tx->tx_txg);
if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
/* zero the new blkptrs we are gaining */
- bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+ memset(dnp->dn_blkptr + dnp->dn_nblkptr, 0,
sizeof (blkptr_t) *
(dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
#ifdef ZFS_DEBUG
@@ -849,6 +869,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
}
+ ASSERT3U(dnp->dn_bonuslen, <=, DN_MAX_BONUS_LEN(dnp));
+
/*
* Although we have dropped our reference to the dnode, it
* can't be evicted until its written, and we haven't yet
diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
index bead7da2237f..5fd8bc2a2682 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
@@ -34,10 +34,11 @@
#include <sys/dsl_bookmark.h>
#include <zfs_namecheck.h>
#include <sys/dmu_send.h>
+#include <sys/dbuf.h>
static int
dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
- dsl_dataset_t **dsp, void *tag, char **shortnamep)
+ dsl_dataset_t **dsp, const void *tag, char **shortnamep)
{
char buf[ZFS_MAX_DATASET_NAME_LEN];
char *hashp;
@@ -82,7 +83,7 @@ dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname,
* Zero out the bookmark in case the one stored on disk
* is in an older, shorter format.
*/
- bzero(bmark_phys, sizeof (*bmark_phys));
+ memset(bmark_phys, 0, sizeof (*bmark_phys));
err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0,
@@ -160,15 +161,14 @@ dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source)
int
dsl_bookmark_create_nvl_validate(nvlist_t *bmarks)
{
- char *first;
- size_t first_len;
+ const char *first = NULL;
+ size_t first_len = 0;
- first = NULL;
for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
- char *bmark = nvpair_name(pair);
- char *source;
+ const char *bmark = nvpair_name(pair);
+ const char *source;
/* list structure: values must be snapshots XOR bookmarks */
if (nvpair_value_string(pair, &source) != 0)
@@ -178,7 +178,7 @@ dsl_bookmark_create_nvl_validate(nvlist_t *bmarks)
/* same pool check */
if (first == NULL) {
- char *cp = strpbrk(bmark, "/#");
+ const char *cp = strpbrk(bmark, "/#");
if (cp == NULL)
return (-1);
first = bmark;
@@ -230,7 +230,6 @@ dsl_bookmark_create_check_impl(dsl_pool_t *dp,
switch (error) {
case ESRCH:
/* happy path: new bmark doesn't exist, proceed after switch */
- error = 0;
break;
case 0:
error = SET_ERROR(EEXIST);
@@ -307,11 +306,11 @@ dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
- char *new = nvpair_name(pair);
+ const char *new = nvpair_name(pair);
int error = schema_err;
if (error == 0) {
- char *source = fnvpair_value_string(pair);
+ const char *source = fnvpair_value_string(pair);
error = dsl_bookmark_create_check_impl(dp, new, source);
if (error != 0)
error = SET_ERROR(error);
@@ -347,6 +346,8 @@ dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap)
spa_t *spa = dsl_dataset_get_spa(snap);
objset_t *mos = spa_get_dsl(spa)->dp_meta_objset;
dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap);
+
+ memset(zbm, 0, sizeof (zfs_bookmark_phys_t));
zbm->zbm_guid = dsp->ds_guid;
zbm->zbm_creation_txg = dsp->ds_creation_txg;
zbm->zbm_creation_time = dsp->ds_creation_time;
@@ -380,10 +381,6 @@ dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap)
&zbm->zbm_compressed_freed_before_next_snap,
&zbm->zbm_uncompressed_freed_before_next_snap);
dsl_dataset_rele(nextds, FTAG);
- } else {
- bzero(&zbm->zbm_flags,
- sizeof (zfs_bookmark_phys_t) -
- offsetof(zfs_bookmark_phys_t, zbm_flags));
}
}
@@ -426,8 +423,8 @@ dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn,
spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx);
}
- __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 };
- ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size,
+ zfs_bookmark_phys_t zero_phys = { 0 };
+ ASSERT0(memcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size,
&zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size));
VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name,
@@ -441,8 +438,8 @@ dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn,
*/
static void
dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
- dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag,
- redaction_list_t **redaction_list)
+ dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps,
+ const void *tag, redaction_list_t **redaction_list)
{
dsl_pool_t *dp = dmu_tx_pool(tx);
objset_t *mos = dp->dp_meta_objset;
@@ -463,26 +460,43 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps);
if (redaction_list != NULL || bookmark_redacted) {
redaction_list_t *local_rl;
+ boolean_t spill = B_FALSE;
if (bookmark_redacted) {
redact_snaps = dsredactsnaps;
num_redact_snaps = dsnumsnaps;
}
+ int bonuslen = sizeof (redaction_list_phys_t) +
+ num_redact_snaps * sizeof (uint64_t);
+ if (bonuslen > dmu_bonus_max())
+ spill = B_TRUE;
dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos,
DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
- DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) +
- num_redact_snaps * sizeof (uint64_t), tx);
+ DMU_OTN_UINT64_METADATA, spill ? 0 : bonuslen, tx);
spa_feature_incr(dp->dp_spa,
SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+ if (spill) {
+ spa_feature_incr(dp->dp_spa,
+ SPA_FEATURE_REDACTION_LIST_SPILL, tx);
+ }
VERIFY0(dsl_redaction_list_hold_obj(dp,
dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl));
dsl_redaction_list_long_hold(dp, local_rl, tag);
- ASSERT3U((local_rl)->rl_dbuf->db_size, >=,
- sizeof (redaction_list_phys_t) + num_redact_snaps *
- sizeof (uint64_t));
- dmu_buf_will_dirty(local_rl->rl_dbuf, tx);
- bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps,
+ if (!spill) {
+ ASSERT3U(local_rl->rl_bonus->db_size, >=, bonuslen);
+ dmu_buf_will_dirty(local_rl->rl_bonus, tx);
+ } else {
+ dmu_buf_t *db;
+ VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
+ DB_RF_MUST_SUCCEED, FTAG, &db));
+ dmu_buf_will_fill(db, tx, B_FALSE);
+ VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen,
+ SPA_MINBLOCKSIZE), tx));
+ local_rl->rl_phys = db->db_data;
+ local_rl->rl_dbuf = db;
+ }
+ memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps,
sizeof (uint64_t) * num_redact_snaps);
local_rl->rl_phys->rlp_num_snaps = num_redact_snaps;
if (bookmark_redacted) {
@@ -593,8 +607,8 @@ dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
- char *new = nvpair_name(pair);
- char *source = fnvpair_value_string(pair);
+ const char *new = nvpair_name(pair);
+ const char *source = fnvpair_value_string(pair);
if (strchr(source, '@') != NULL) {
dsl_bookmark_create_sync_impl_snap(new, source, tx,
@@ -640,11 +654,15 @@ dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx)
SPA_FEATURE_REDACTION_BOOKMARKS))
return (SET_ERROR(ENOTSUP));
/*
- * If the list of redact snaps will not fit in the bonus buffer with
- * the furthest reached object and offset, fail.
+ * If the list of redact snaps will not fit in the bonus buffer (or
+ * spill block, with the REDACTION_LIST_SPILL feature) with the
+ * furthest reached object and offset, fail.
*/
- if (dbcra->dbcra_numsnaps > (dmu_bonus_max() -
- sizeof (redaction_list_phys_t)) / sizeof (uint64_t))
+ uint64_t snaplimit = ((spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_REDACTION_LIST_SPILL) ? spa_maxblocksize(dp->dp_spa) :
+ dmu_bonus_max()) -
+ sizeof (redaction_list_phys_t)) / sizeof (uint64_t);
+ if (dbcra->dbcra_numsnaps > snaplimit)
return (SET_ERROR(E2BIG));
if (dsl_bookmark_create_nvl_validate_pair(
@@ -667,7 +685,8 @@ dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx)
int
dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot,
- uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl)
+ uint64_t numsnaps, uint64_t *snapguids, const void *tag,
+ redaction_list_t **rl)
{
dsl_bookmark_create_redacted_arg_t dbcra;
@@ -1043,6 +1062,14 @@ dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name,
}
if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+ dnode_t *rl;
+ VERIFY0(dnode_hold(mos,
+ dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
+ if (rl->dn_have_spill) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_REDACTION_LIST_SPILL, tx);
+ }
+ dnode_rele(rl, FTAG);
VERIFY0(dmu_object_free(mos,
dbn->dbn_phys.zbm_redaction_obj, tx));
spa_feature_decr(dmu_objset_spa(mos),
@@ -1191,19 +1218,19 @@ dsl_redaction_list_long_held(redaction_list_t *rl)
}
void
-dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag)
+dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl,
+ const void *tag)
{
ASSERT(dsl_pool_config_held(dp));
(void) zfs_refcount_add(&rl->rl_longholds, tag);
}
void
-dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag)
+dsl_redaction_list_long_rele(redaction_list_t *rl, const void *tag)
{
(void) zfs_refcount_remove(&rl->rl_longholds, tag);
}
-/* ARGSUSED */
static void
redaction_list_evict_sync(void *rlu)
{
@@ -1214,17 +1241,19 @@ redaction_list_evict_sync(void *rlu)
}
void
-dsl_redaction_list_rele(redaction_list_t *rl, void *tag)
+dsl_redaction_list_rele(redaction_list_t *rl, const void *tag)
{
- dmu_buf_rele(rl->rl_dbuf, tag);
+ if (rl->rl_bonus != rl->rl_dbuf)
+ dmu_buf_rele(rl->rl_dbuf, tag);
+ dmu_buf_rele(rl->rl_bonus, tag);
}
int
-dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag,
+dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag,
redaction_list_t **rlp)
{
objset_t *mos = dp->dp_meta_objset;
- dmu_buf_t *dbuf;
+ dmu_buf_t *dbuf, *spill_dbuf;
redaction_list_t *rl;
int err;
@@ -1239,13 +1268,18 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag,
redaction_list_t *winner = NULL;
rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP);
- rl->rl_dbuf = dbuf;
+ rl->rl_bonus = dbuf;
+ if (dmu_spill_hold_existing(dbuf, tag, &spill_dbuf) == 0) {
+ rl->rl_dbuf = spill_dbuf;
+ } else {
+ rl->rl_dbuf = dbuf;
+ }
rl->rl_object = rlobj;
- rl->rl_phys = dbuf->db_data;
+ rl->rl_phys = rl->rl_dbuf->db_data;
rl->rl_mos = dp->dp_meta_objset;
zfs_refcount_create(&rl->rl_longholds);
dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL,
- &rl->rl_dbuf);
+ &rl->rl_bonus);
if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) {
kmem_free(rl, sizeof (*rl));
rl = winner;
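The hold path above now prefers the spill block's data when one exists and falls back to the bonus buffer otherwise, and the matching rele drops the extra hold only when the two buffers differ. A minimal sketch of that shape; struct buf and try_hold_spill() are hypothetical stand-ins, not ZFS APIs:

#include <stdio.h>

struct buf { const char *name; };

/* Stand-in for dmu_spill_hold_existing(): returns 0 and sets *out when a
 * spill block exists, nonzero otherwise. */
static int
try_hold_spill(struct buf **out, int have_spill)
{
    static struct buf spill = { "spill" };

    if (!have_spill)
        return (-1);
    *out = &spill;
    return (0);
}

int
main(void)
{
    static struct buf bonus = { "bonus" };
    struct buf *data_buf, *spill_buf;

    /* The phys pointer comes from whichever buffer actually backs the data. */
    data_buf = (try_hold_spill(&spill_buf, 1) == 0) ? spill_buf : &bonus;
    printf("phys lives in the %s buffer\n", data_buf->name);

    /* On release, drop the spill hold only if it is distinct from the bonus. */
    if (data_buf != &bonus)
        printf("rele %s\n", data_buf->name);
    printf("rele %s\n", bonus.name);
    return (0);
}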
@@ -1295,7 +1329,7 @@ dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
* The empty-string name can't be in the AVL, and it compares
* before any entries with this TXG.
*/
- search.dbn_name = "";
+ search.dbn_name = (char *)"";
VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
dsl_bookmark_node_t *dbn =
avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
@@ -1422,7 +1456,7 @@ dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin,
* The empty-string name can't be in the AVL, and it compares
* before any entries with this TXG.
*/
- search.dbn_name = "";
+ search.dbn_name = (char *)"";
VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
dsl_bookmark_node_t *dbn =
avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
@@ -1470,10 +1504,11 @@ dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin,
* Adjust the FBN of any bookmarks that reference this block, whose "next"
* is the head dataset.
*/
-/* ARGSUSED */
void
dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
+ (void) tx;
+
/*
* Iterate over bookmarks whose "next" is the head dataset.
*/
@@ -1485,7 +1520,8 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
* If the block was live (referenced) at the time of this
* bookmark, add its space to the bookmark's FBN.
*/
- if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg &&
+ if (BP_GET_LOGICAL_BIRTH(bp) <=
+ dbn->dbn_phys.zbm_creation_txg &&
(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
mutex_enter(&dbn->dbn_lock);
dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
index 26d4c2fe7e33..8e1055d9bcb1 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
@@ -80,13 +80,13 @@
int zfs_disable_ivset_guid_check = 0;
static void
-dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag)
+dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, const void *tag)
{
(void) zfs_refcount_add(&wkey->wk_refcnt, tag);
}
static void
-dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag)
+dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, const void *tag)
{
(void) zfs_refcount_remove(&wkey->wk_refcnt, tag);
}
@@ -97,7 +97,7 @@ dsl_wrapping_key_free(dsl_wrapping_key_t *wkey)
ASSERT0(zfs_refcount_count(&wkey->wk_refcnt));
if (wkey->wk_key.ck_data) {
- bzero(wkey->wk_key.ck_data,
+ memset(wkey->wk_key.ck_data, 0,
CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
kmem_free(wkey->wk_key.ck_data,
CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
@@ -119,9 +119,8 @@ dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat,
/* allocate and initialize the underlying crypto key */
wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP);
- wkey->wk_key.ck_format = CRYPTO_KEY_RAW;
wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN);
- bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN);
+ memcpy(wkey->wk_key.ck_data, wkeydata, WRAPPING_KEY_LEN);
/* initialize the rest of the struct */
zfs_refcount_create(&wkey->wk_refcnt);
@@ -144,7 +143,7 @@ dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props,
dsl_wrapping_key_t *wkey = NULL;
uint8_t *wkeydata = NULL;
uint_t wkeydata_len = 0;
- char *keylocation = NULL;
+ const char *keylocation = NULL;
dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP);
dcp->cp_cmd = cmd;
@@ -267,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b)
return (0);
}
+/*
+ * this compares a crypto key based on zk_guid. See comment on
+ * spa_crypto_key_compare for more information.
+ */
+boolean_t
+dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb)
+{
+ dsl_crypto_key_t *dcka = NULL;
+ dsl_crypto_key_t *dckb = NULL;
+ uint64_t obja, objb;
+ boolean_t equal;
+ spa_t *spa;
+
+ spa = dmu_objset_spa(osa);
+ if (spa != dmu_objset_spa(osb))
+ return (B_FALSE);
+ obja = dmu_objset_ds(osa)->ds_object;
+ objb = dmu_objset_ds(osb)->ds_object;
+
+ if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0)
+ return (B_FALSE);
+ if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) {
+ spa_keystore_dsl_key_rele(spa, dcka, FTAG);
+ return (B_FALSE);
+ }
+
+ equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid);
+
+ spa_keystore_dsl_key_rele(spa, dcka, FTAG);
+ spa_keystore_dsl_key_rele(spa, dckb, FTAG);
+
+ return (equal);
+}
+
static int
spa_key_mapping_compare(const void *a, const void *b)
{
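dmu_objset_crypto_key_equal() above follows a hold/compare/release shape: take a hold on each key, compare the GUIDs, and make sure the first hold is dropped if the second lookup fails. A standalone sketch of that shape; key_lookup() and key_rele() are hypothetical helpers, not the real keystore API:

#include <stdint.h>
#include <stdio.h>

struct key { uint64_t guid; int holds; };

static int
key_lookup(struct key *k, struct key **out)
{
    if (k == NULL)
        return (-1);            /* e.g. key not loaded */
    k->holds++;
    *out = k;
    return (0);
}

static void
key_rele(struct key *k)
{
    k->holds--;
}

static int
keys_equal(struct key *ka_src, struct key *kb_src)
{
    struct key *ka, *kb;
    int equal;

    if (key_lookup(ka_src, &ka) != 0)
        return (0);
    if (key_lookup(kb_src, &kb) != 0) {
        key_rele(ka);           /* drop the first hold on the error path */
        return (0);
    }
    equal = (ka->guid == kb->guid);
    key_rele(ka);
    key_rele(kb);
    return (equal);
}

int
main(void)
{
    struct key a = { 42, 0 }, b = { 42, 0 }, c = { 7, 0 };

    printf("a==b: %d, a==c: %d\n", keys_equal(&a, &b), keys_equal(&a, &c));
    return (0);
}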
@@ -369,7 +402,7 @@ dsl_dir_incompatible_encryption_version(dsl_dir_t *dd)
static int
spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj,
- void *tag, dsl_wrapping_key_t **wkey_out)
+ const void *tag, dsl_wrapping_key_t **wkey_out)
{
int ret;
dsl_wrapping_key_t search_wkey;
@@ -399,7 +432,7 @@ error:
}
static int
-spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, const void *tag,
dsl_wrapping_key_t **wkey_out)
{
int ret;
@@ -515,7 +548,7 @@ dsl_crypto_key_free(dsl_crypto_key_t *dck)
}
static void
-dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag)
+dsl_crypto_key_rele(dsl_crypto_key_t *dck, const void *tag)
{
if (zfs_refcount_remove(&dck->dck_holds, tag) == 0)
dsl_crypto_key_free(dck);
@@ -523,7 +556,7 @@ dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag)
static int
dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey,
- uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out)
+ uint64_t dckobj, const void *tag, dsl_crypto_key_t **dck_out)
{
int ret;
uint64_t crypt = 0, guid = 0, version = 0;
@@ -542,6 +575,12 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey,
if (ret != 0)
goto error;
+ /* handle a future crypto suite that we don't support */
+ if (crypt >= ZIO_CRYPT_FUNCTIONS) {
+ ret = (SET_ERROR(ZFS_ERR_CRYPTO_NOTSUP));
+ goto error;
+ }
+
ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid);
if (ret != 0)
goto error;
@@ -592,7 +631,7 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey,
error:
if (dck != NULL) {
- bzero(dck, sizeof (dsl_crypto_key_t));
+ memset(dck, 0, sizeof (dsl_crypto_key_t));
kmem_free(dck, sizeof (dsl_crypto_key_t));
}
@@ -601,7 +640,7 @@ error:
}
static int
-spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag,
+spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, const void *tag,
dsl_crypto_key_t **dck_out)
{
int ret;
@@ -632,7 +671,7 @@ error:
}
static int
-spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, const void *tag,
dsl_crypto_key_t **dck_out)
{
int ret;
@@ -690,7 +729,7 @@ spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
}
void
-spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag)
+spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, const void *tag)
{
rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER);
@@ -937,7 +976,7 @@ error:
}
void
-key_mapping_add_ref(dsl_key_mapping_t *km, void *tag)
+key_mapping_add_ref(dsl_key_mapping_t *km, const void *tag)
{
ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);
zfs_refcount_add(&km->km_refcnt, tag);
@@ -954,7 +993,7 @@ key_mapping_add_ref(dsl_key_mapping_t *km, void *tag)
* mapping after unmounting a dataset.
*/
void
-key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag)
+key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, const void *tag)
{
ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);
@@ -985,7 +1024,7 @@ key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag)
}
int
-spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag,
+spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, const void *tag,
dsl_key_mapping_t **km_out)
{
int ret;
@@ -1044,7 +1083,7 @@ spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag,
}
int
-spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag)
+spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, const void *tag)
{
int ret;
dsl_key_mapping_t search_km;
@@ -1082,7 +1121,7 @@ error_unlock:
* without getting a reference to it.
*/
int
-spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag,
+spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, const void *tag,
dsl_crypto_key_t **dck_out)
{
int ret;
@@ -1138,7 +1177,7 @@ dmu_objset_check_wkey_loaded(dsl_dir_t *dd)
return (0);
}
-static zfs_keystatus_t
+zfs_keystatus_t
dsl_dataset_get_keystatus(dsl_dir_t *dd)
{
/* check if this dd has a dsl key */
@@ -1507,7 +1546,7 @@ spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx)
dsl_crypto_params_t *dcp = skcka->skcka_cp;
dsl_wrapping_key_t *wkey = NULL, *found_wkey;
dsl_wrapping_key_t wkey_search;
- char *keylocation = dcp->cp_keylocation;
+ const char *keylocation = dcp->cp_keylocation;
uint64_t rddobj, new_rddobj;
/* create and initialize the wrapping key */
@@ -2007,14 +2046,6 @@ dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds,
if (ret != 0)
return (ret);
- /*
- * Useraccounting is not portable and must be done with the keys loaded.
- * Therefore, whenever we do any kind of receive the useraccounting
- * must not be present.
- */
- ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
- ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
-
mdn = DMU_META_DNODE(os);
/*
@@ -2104,8 +2135,9 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
* written out raw next time.
*/
arc_release(os->os_phys_buf, &os->os_phys_buf);
- bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN);
- bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ memcpy(os->os_phys->os_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+ memset(os->os_phys->os_local_mac, 0, ZIO_OBJSET_MAC_LEN);
+ os->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE;
os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
/* set metadnode compression and checksum */
@@ -2127,9 +2159,6 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dsl_dataset_sync(ds, zio, tx);
VERIFY0(zio_wait(zio));
-
- /* dsl_dataset_sync_done will drop this reference. */
- dmu_buf_add_ref(ds->ds_dbuf, ds);
dsl_dataset_sync_done(ds, tx);
}
}
@@ -2152,10 +2181,16 @@ dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
* wrapping key.
*/
ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval);
- if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS ||
- intval <= ZIO_CRYPT_OFF)
+ if (ret != 0 || intval <= ZIO_CRYPT_OFF)
return (SET_ERROR(EINVAL));
+ /*
+ * Flag a future crypto suite that we don't support differently, so
+ * we can return a more useful error to the user.
+ */
+ if (intval >= ZIO_CRYPT_FUNCTIONS)
+ return (SET_ERROR(ZFS_ERR_CRYPTO_NOTSUP));
+
ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval);
if (ret != 0)
return (SET_ERROR(EINVAL));
@@ -2237,7 +2272,7 @@ dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
uint8_t *keydata, *hmac_keydata, *iv, *mac;
uint64_t crypt, key_guid, keyformat, iters, salt;
uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION;
- char *keylocation = "prompt";
+ const char *keylocation = "prompt";
/* lookup the values we need to create the DSL Crypto Key */
crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE);
@@ -2555,7 +2590,7 @@ dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,
DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx));
zio_crypt_key_destroy(&dck.dck_key);
- bzero(&dck.dck_key, sizeof (zio_crypt_key_t));
+ memset(&dck.dck_key, 0, sizeof (zio_crypt_key_t));
return (dck.dck_obj);
}
@@ -2679,6 +2714,7 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
objset_phys_t *osp = buf;
uint8_t portable_mac[ZIO_OBJSET_MAC_LEN];
uint8_t local_mac[ZIO_OBJSET_MAC_LEN];
+ const uint8_t zeroed_mac[ZIO_OBJSET_MAC_LEN] = {0};
/* look up the key from the spa's keystore */
ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
@@ -2695,16 +2731,30 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
/* if we are generating encode the HMACs in the objset_phys_t */
if (generate) {
- bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN);
- bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ memcpy(osp->os_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+ memcpy(osp->os_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
abd_return_buf_copy(abd, buf, datalen);
return (0);
}
- if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 ||
- bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) {
- abd_return_buf(abd, buf, datalen);
- return (SET_ERROR(ECKSUM));
+ if (memcmp(portable_mac, osp->os_portable_mac,
+ ZIO_OBJSET_MAC_LEN) != 0 ||
+ memcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) {
+ /*
+ * If the MAC is zeroed out, we failed to decrypt it.
+ * This should only arise, at least on Linux,
+ * if we hit edge case handling for useraccounting, since we
+ * shouldn't get here without bailing out on error earlier
+ * otherwise.
+ *
+ * So if we're in that case, we can just fall through; the
+ * special case that notices the zeroed MAC will handle it
+ * elsewhere, since we can just regenerate it.
+ */
+ if (memcmp(local_mac, zeroed_mac, ZIO_OBJSET_MAC_LEN) != 0) {
+ abd_return_buf(abd, buf, datalen);
+ return (SET_ERROR(ECKSUM));
+ }
}
abd_return_buf(abd, buf, datalen);
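The check above now tolerates an all-zero local MAC rather than returning ECKSUM, on the assumption that it can simply be regenerated later. A simplified standalone sketch covering only the local-MAC decision; MAC_LEN is an assumed stand-in for ZIO_OBJSET_MAC_LEN:

#include <stdio.h>
#include <string.h>

#define MAC_LEN 32  /* assumed stand-in for ZIO_OBJSET_MAC_LEN */

/* Returns 0 when the stored MAC is acceptable, -1 on a real mismatch.
 * An all-zero stored MAC is treated as "not generated yet" and accepted. */
static int
verify_local_mac(const unsigned char *computed, const unsigned char *stored)
{
    static const unsigned char zeroed[MAC_LEN] = { 0 };

    if (memcmp(computed, stored, MAC_LEN) == 0)
        return (0);             /* MAC matches */
    if (memcmp(stored, zeroed, MAC_LEN) == 0)
        return (0);             /* zeroed: fall through and regenerate later */
    return (-1);                /* genuine mismatch, i.e. ECKSUM */
}

int
main(void)
{
    unsigned char computed[MAC_LEN] = { 1 };
    unsigned char zero_stored[MAC_LEN] = { 0 };
    unsigned char bad_stored[MAC_LEN] = { 2 };

    printf("zeroed stored MAC tolerated: %s\n",
        verify_local_mac(computed, zero_stored) == 0 ? "yes" : "no");
    printf("mismatched MAC rejected:     %s\n",
        verify_local_mac(computed, bad_stored) != 0 ? "yes" : "no");
    return (0);
}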
@@ -2746,11 +2796,11 @@ spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd,
* Otherwise verify that the MAC matched what we expected.
*/
if (generate) {
- bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN);
+ memcpy(mac, digestbuf, ZIO_DATA_MAC_LEN);
return (0);
}
- if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0)
+ if (memcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0)
return (SET_ERROR(ECKSUM));
return (0);
@@ -2849,9 +2899,9 @@ spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb,
error:
if (encrypt) {
/* zero out any state we might have changed while encrypting */
- bzero(salt, ZIO_DATA_SALT_LEN);
- bzero(iv, ZIO_DATA_IV_LEN);
- bzero(mac, ZIO_DATA_MAC_LEN);
+ memset(salt, 0, ZIO_DATA_SALT_LEN);
+ memset(iv, 0, ZIO_DATA_IV_LEN);
+ memset(mac, 0, ZIO_DATA_MAC_LEN);
abd_return_buf(pabd, plainbuf, datalen);
abd_return_buf_copy(cabd, cipherbuf, datalen);
} else {
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
index f99964511aa6..b4de0e7ff073 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -73,13 +73,22 @@
* The SPA supports block sizes up to 16MB. However, very large blocks
* can have an impact on i/o latency (e.g. tying up a spinning disk for
* ~300ms), and also potentially on the memory allocator. Therefore,
- * we do not allow the recordsize to be set larger than zfs_max_recordsize
- * (default 1MB). Larger blocks can be created by changing this tunable,
- * and pools with larger blocks can always be imported and used, regardless
- * of this setting.
+ * we did not allow the recordsize to be set larger than zfs_max_recordsize
+ * (former default: 1MB). Larger blocks could be created by changing this
+ * tunable, and pools with larger blocks could always be imported and used,
+ * regardless of this setting.
+ *
+ * We do, however, still limit it by default to 1M on x86_32, because Linux's
+ * 3/1 memory split doesn't leave much room for 16M chunks.
*/
-int zfs_max_recordsize = 1 * 1024 * 1024;
-int zfs_allow_redacted_dataset_mount = 0;
+#ifdef _ILP32
+uint_t zfs_max_recordsize = 1 * 1024 * 1024;
+#else
+uint_t zfs_max_recordsize = 16 * 1024 * 1024;
+#endif
+static int zfs_allow_redacted_dataset_mount = 0;
+
+int zfs_snapshot_history_enabled = 1;
#define SWITCH64(x, y) \
{ \
@@ -90,8 +99,6 @@ int zfs_allow_redacted_dataset_mount = 0;
#define DS_REF_MAX (1ULL << 62)
-extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
-
static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
uint64_t obj, dmu_tx_t *tx);
static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
@@ -99,7 +106,7 @@ static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f);
-extern int spa_asize_inflation;
+extern uint_t spa_asize_inflation;
static zil_header_t zero_zil;
@@ -149,7 +156,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
return;
}
- ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
@@ -183,7 +191,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -229,7 +237,7 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
mutex_exit(&ds->ds_remap_deadlist_lock);
BP_ZERO(&fakebp);
- fakebp.blk_birth = birth;
+ BP_SET_LOGICAL_BIRTH(&fakebp, birth);
DVA_SET_VDEV(dva, vdev);
DVA_SET_OFFSET(dva, offset);
DVA_SET_ASIZE(dva, size);
@@ -252,7 +260,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
return (0);
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(bp->blk_birth <= tx->tx_txg);
+ ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg);
if (ds == NULL) {
dsl_free(tx->tx_pool, tx->tx_txg, bp);
@@ -270,7 +278,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -278,7 +286,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
bplist_append(&ds->ds_dir->dd_pending_frees, bp);
}
- if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object);
@@ -310,16 +318,16 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
ASSERT3U(ds->ds_prev->ds_object, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj);
ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
- /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+ /* if (logical birth > prev prev snap txg) prev unique += bs */
if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
- ds->ds_object && bp->blk_birth >
+ ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) >
dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
mutex_enter(&ds->ds_prev->ds_lock);
dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
- if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
dsl_dir_transfer_space(ds->ds_dir, used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
@@ -524,7 +532,7 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
matchtype_t mt = 0;
int err;
- dsl_dir_snap_cmtime_update(ds->ds_dir);
+ dsl_dir_snap_cmtime_update(ds->ds_dir, tx);
if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
mt = MT_NORMALIZE;
@@ -541,7 +549,7 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
}
boolean_t
-dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, const void *tag)
{
dmu_buf_t *dbuf = ds->ds_dbuf;
boolean_t result = B_FALSE;
@@ -559,7 +567,7 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
}
int
-dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag,
dsl_dataset_t **dsp)
{
objset_t *mos = dp->dp_meta_objset;
@@ -633,6 +641,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_phys(ds)->ds_prev_snap_obj,
ds, &ds->ds_prev);
}
+ if (err != 0)
+ goto after_dsl_bookmark_fini;
err = dsl_bookmark_init_ds(ds);
} else {
if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
@@ -681,11 +691,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
if (err != 0 || winner != NULL) {
- bplist_destroy(&ds->ds_pending_deadlist);
dsl_deadlist_close(&ds->ds_deadlist);
if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
dsl_deadlist_close(&ds->ds_remap_deadlist);
dsl_bookmark_fini_ds(ds);
+after_dsl_bookmark_fini:
if (ds->ds_prev)
dsl_dataset_rele(ds->ds_prev, ds);
dsl_dir_rele(ds->ds_dir, ds);
@@ -696,6 +706,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
list_destroy(&ds->ds_prop_cbs);
list_destroy(&ds->ds_sendstreams);
+ bplist_destroy(&ds->ds_pending_deadlist);
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_sendstream_lock);
@@ -748,7 +759,7 @@ dsl_dataset_create_key_mapping(dsl_dataset_t *ds)
int
dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
- ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
+ ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp)
{
int err;
@@ -769,7 +780,7 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
int
dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
- void *tag, dsl_dataset_t **dsp)
+ const void *tag, dsl_dataset_t **dsp)
{
dsl_dir_t *dd;
const char *snapname;
@@ -822,7 +833,7 @@ dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
}
int
-dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
+dsl_dataset_hold(dsl_pool_t *dp, const char *name, const void *tag,
dsl_dataset_t **dsp)
{
return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
@@ -830,7 +841,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
static int
dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
- void *tag, boolean_t override, dsl_dataset_t **dsp)
+ const void *tag, boolean_t override, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
if (err != 0)
@@ -846,21 +857,21 @@ dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
int
dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
- void *tag, dsl_dataset_t **dsp)
+ const void *tag, dsl_dataset_t **dsp)
{
return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp));
}
int
dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj,
- ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
+ ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp)
{
return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp));
}
static int
dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
- void *tag, boolean_t override, dsl_dataset_t **dsp)
+ const void *tag, boolean_t override, dsl_dataset_t **dsp)
{
int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
if (err != 0)
@@ -874,14 +885,14 @@ dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
int
dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
- void *tag, dsl_dataset_t **dsp)
+ const void *tag, dsl_dataset_t **dsp)
{
return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp));
}
int
dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
- void *tag, dsl_dataset_t **dsp)
+ const void *tag, dsl_dataset_t **dsp)
{
return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp));
}
@@ -896,14 +907,14 @@ dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
* and accessed.
*/
void
-dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+dsl_dataset_long_hold(dsl_dataset_t *ds, const void *tag)
{
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
(void) zfs_refcount_add(&ds->ds_longholds, tag);
}
void
-dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+dsl_dataset_long_rele(dsl_dataset_t *ds, const void *tag)
{
(void) zfs_refcount_remove(&ds->ds_longholds, tag);
}
@@ -960,7 +971,7 @@ dsl_dataset_namelen(dsl_dataset_t *ds)
}
void
-dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+dsl_dataset_rele(dsl_dataset_t *ds, const void *tag)
{
dmu_buf_rele(ds->ds_dbuf, tag);
}
@@ -978,7 +989,8 @@ dsl_dataset_remove_key_mapping(dsl_dataset_t *ds)
}
void
-dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
+dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags,
+ const void *tag)
{
if (flags & DS_HOLD_FLAG_DECRYPT)
dsl_dataset_remove_key_mapping(ds);
@@ -987,7 +999,7 @@ dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
}
void
-dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
+dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, const void *tag)
{
ASSERT3P(ds->ds_owner, ==, tag);
ASSERT(ds->ds_dbuf != NULL);
@@ -1000,7 +1012,7 @@ dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
}
boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override)
+dsl_dataset_tryown(dsl_dataset_t *ds, const void *tag, boolean_t override)
{
boolean_t gotit = FALSE;
@@ -1150,7 +1162,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
- bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ memset(dsphys, 0, sizeof (dsl_dataset_phys_t));
dsphys->ds_dir_obj = dd->dd_object;
dsphys->ds_flags = flags;
dsphys->ds_fsid_guid = unique_create();
@@ -1250,20 +1262,17 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
objset_t *os;
VERIFY0(dmu_objset_from_ds(ds, &os));
- if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
+ if (memcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
dsl_pool_t *dp = ds->ds_dir->dd_pool;
zio_t *zio;
- bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+ memset(&os->os_zil_header, 0, sizeof (os->os_zil_header));
if (os->os_encrypted)
os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dsl_dataset_sync(ds, zio, tx);
VERIFY0(zio_wait(zio));
-
- /* dsl_dataset_sync_done will drop this reference. */
- dmu_buf_add_ref(ds->ds_dbuf, ds);
dsl_dataset_sync_done(ds, tx);
}
}
@@ -1612,7 +1621,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
for (pair = nvlist_next_nvpair(cnt_track, NULL);
pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
int error = 0;
- char *name;
+ const char *name;
uint64_t cnt = 0;
dsl_dataset_t *ds;
@@ -1644,7 +1653,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
int error = 0;
dsl_dataset_t *ds;
- char *name, *atp = NULL;
+ const char *name, *atp = NULL;
char dsname[ZFS_MAX_DATASET_NAME_LEN];
name = nvpair_name(pair);
@@ -1687,7 +1696,6 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsl_dataset_phys_t *dsphys;
uint64_t dsobj, crtxg;
objset_t *mos = dp->dp_meta_objset;
- static zil_header_t zero_zil __maybe_unused;
objset_t *os __maybe_unused;
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
@@ -1698,7 +1706,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
*/
ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
dmu_objset_from_ds(ds, &os) != 0 ||
- bcmp(&os->os_phys->os_zil_header, &zero_zil,
+ memcmp(&os->os_phys->os_zil_header, &zero_zil,
sizeof (zero_zil)) == 0);
/* Should not snapshot a dirty dataset. */
@@ -1720,7 +1728,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
- bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ memset(dsphys, 0, sizeof (dsl_dataset_phys_t));
dsphys->ds_dir_obj = ds->ds_dir->dd_object;
dsphys->ds_fsid_guid = unique_create();
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
@@ -1854,9 +1862,10 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dsl_scan_ds_snapshotted(ds, tx);
- dsl_dir_snap_cmtime_update(ds->ds_dir);
+ dsl_dir_snap_cmtime_update(ds->ds_dir, tx);
- spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " ");
+ if (zfs_snapshot_history_enabled)
+ spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " ");
}
void
@@ -1869,7 +1878,7 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
dsl_dataset_t *ds;
- char *name, *atp;
+ const char *name, *atp;
char dsname[ZFS_MAX_DATASET_NAME_LEN];
name = nvpair_name(pair);
@@ -1898,7 +1907,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
boolean_t needsuspend;
int error;
spa_t *spa;
- char *firstname;
+ const char *firstname;
nvlist_t *suspended = NULL;
pair = nvlist_next_nvpair(snaps, NULL);
@@ -1917,8 +1926,8 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
char fsname[ZFS_MAX_DATASET_NAME_LEN];
- char *snapname = nvpair_name(pair);
- char *atp;
+ const char *snapname = nvpair_name(pair);
+ const char *atp;
void *cookie;
atp = strchr(snapname, '@');
@@ -2061,8 +2070,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
return (error);
}
+/* Nonblocking dataset sync. Assumes dataset:objset is always 1:1 */
void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(ds->ds_objset != NULL);
@@ -2090,17 +2100,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
}
- dmu_objset_sync(ds->ds_objset, zio, tx);
-
- for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
- if (zfeature_active(f, ds->ds_feature_activation[f])) {
- if (zfeature_active(f, ds->ds_feature[f]))
- continue;
- dsl_dataset_activate_feature(ds->ds_object, f,
- ds->ds_feature_activation[f], tx);
- ds->ds_feature[f] = ds->ds_feature_activation[f];
- }
- }
+ dmu_objset_sync(ds->ds_objset, rio, tx);
}
/*
@@ -2116,8 +2116,6 @@ dsl_livelist_should_disable(dsl_dataset_t *ds)
used = dsl_dir_get_usedds(ds->ds_dir);
referenced = dsl_get_referenced(ds);
- ASSERT3U(referenced, >=, 0);
- ASSERT3U(used, >=, 0);
if (referenced == 0)
return (B_FALSE);
percent_shared = (100 * (referenced - used)) / referenced;
@@ -2272,9 +2270,18 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
else
ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]);
- ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (zfeature_active(f,
+ ds->ds_feature_activation[f])) {
+ if (zfeature_active(f, ds->ds_feature[f]))
+ continue;
+ dsl_dataset_activate_feature(ds->ds_object, f,
+ ds->ds_feature_activation[f], tx);
+ ds->ds_feature[f] = ds->ds_feature_activation[f];
+ }
+ }
- dmu_buf_rele(ds->ds_dbuf, ds);
+ ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
}
int
@@ -2331,161 +2338,147 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
nvlist_free(propval);
}
-/*
- * Returns a string that represents the receive resume stats token. It should
- * be freed with strfree().
- */
-char *
-get_receive_resume_stats_impl(dsl_dataset_t *ds)
+static char *
+get_receive_resume_token_impl(dsl_dataset_t *ds)
{
+ if (!dsl_dataset_has_resume_receive_state(ds))
+ return (NULL);
+
dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ char *str;
+ void *packed;
+ uint8_t *compressed;
+ uint64_t val;
+ nvlist_t *token_nv = fnvlist_alloc();
+ size_t packed_size, compressed_size;
+
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "fromguid", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "object", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "offset", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "bytes", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "toguid", val);
+ }
+ char buf[MAXNAMELEN];
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+ fnvlist_add_string(token_nv, "toname", buf);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_LARGEBLOCK) == 0) {
+ fnvlist_add_boolean(token_nv, "largeblockok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_EMBEDOK) == 0) {
+ fnvlist_add_boolean(token_nv, "embedok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_COMPRESSOK) == 0) {
+ fnvlist_add_boolean(token_nv, "compressok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_RAWOK) == 0) {
+ fnvlist_add_boolean(token_nv, "rawok");
+ }
+ if (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ uint64_t num_redact_snaps = 0;
+ uint64_t *redact_snaps = NULL;
+ VERIFY3B(dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,
+ &redact_snaps), ==, B_TRUE);
+ fnvlist_add_uint64_array(token_nv, "redact_snaps",
+ redact_snaps, num_redact_snaps);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {
+ uint64_t num_redact_snaps = 0, int_size = 0;
+ uint64_t *redact_snaps = NULL;
+ VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,
+ &num_redact_snaps));
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
- if (dsl_dataset_has_resume_receive_state(ds)) {
- char *str;
- void *packed;
- uint8_t *compressed;
- uint64_t val;
- nvlist_t *token_nv = fnvlist_alloc();
- size_t packed_size, compressed_size;
-
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "fromguid", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "object", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "offset", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "bytes", val);
- }
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
- fnvlist_add_uint64(token_nv, "toguid", val);
- }
- char buf[MAXNAMELEN];
- if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
- fnvlist_add_string(token_nv, "toname", buf);
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_LARGEBLOCK) == 0) {
- fnvlist_add_boolean(token_nv, "largeblockok");
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_EMBEDOK) == 0) {
- fnvlist_add_boolean(token_nv, "embedok");
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_COMPRESSOK) == 0) {
- fnvlist_add_boolean(token_nv, "compressok");
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_RAWOK) == 0) {
- fnvlist_add_boolean(token_nv, "rawok");
- }
- if (dsl_dataset_feature_is_active(ds,
- SPA_FEATURE_REDACTED_DATASETS)) {
- uint64_t num_redact_snaps;
- uint64_t *redact_snaps;
- VERIFY(dsl_dataset_get_uint64_array_feature(ds,
- SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,
- &redact_snaps));
- fnvlist_add_uint64_array(token_nv, "redact_snaps",
- redact_snaps, num_redact_snaps);
- }
- if (zap_contains(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {
- uint64_t num_redact_snaps, int_size;
- uint64_t *redact_snaps;
- VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,
- &num_redact_snaps));
- ASSERT3U(int_size, ==, sizeof (uint64_t));
-
- redact_snaps = kmem_alloc(int_size * num_redact_snaps,
- KM_SLEEP);
- VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object,
- DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,
- num_redact_snaps, redact_snaps));
- fnvlist_add_uint64_array(token_nv, "book_redact_snaps",
- redact_snaps, num_redact_snaps);
- kmem_free(redact_snaps, int_size * num_redact_snaps);
- }
- packed = fnvlist_pack(token_nv, &packed_size);
- fnvlist_free(token_nv);
- compressed = kmem_alloc(packed_size, KM_SLEEP);
-
- compressed_size = gzip_compress(packed, compressed,
- packed_size, packed_size, 6);
-
- zio_cksum_t cksum;
- fletcher_4_native_varsize(compressed, compressed_size, &cksum);
-
- size_t alloc_size = compressed_size * 2 + 1;
- str = kmem_alloc(alloc_size, KM_SLEEP);
- for (int i = 0; i < compressed_size; i++) {
- size_t offset = i * 2;
- (void) snprintf(str + offset, alloc_size - offset,
- "%02x", compressed[i]);
- }
- str[compressed_size * 2] = '\0';
- char *propval = kmem_asprintf("%u-%llx-%llx-%s",
- ZFS_SEND_RESUME_TOKEN_VERSION,
- (longlong_t)cksum.zc_word[0],
- (longlong_t)packed_size, str);
- kmem_free(packed, packed_size);
- kmem_free(str, alloc_size);
- kmem_free(compressed, packed_size);
- return (propval);
- }
- return (kmem_strdup(""));
+ redact_snaps = kmem_alloc(int_size * num_redact_snaps,
+ KM_SLEEP);
+ VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,
+ num_redact_snaps, redact_snaps));
+ fnvlist_add_uint64_array(token_nv, "book_redact_snaps",
+ redact_snaps, num_redact_snaps);
+ kmem_free(redact_snaps, int_size * num_redact_snaps);
+ }
+ packed = fnvlist_pack(token_nv, &packed_size);
+ fnvlist_free(token_nv);
+ compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+ compressed_size = gzip_compress(packed, compressed,
+ packed_size, packed_size, 6);
+
+ zio_cksum_t cksum;
+ fletcher_4_native_varsize(compressed, compressed_size, &cksum);
+
+ size_t alloc_size = compressed_size * 2 + 1;
+ str = kmem_alloc(alloc_size, KM_SLEEP);
+ for (int i = 0; i < compressed_size; i++) {
+ size_t offset = i * 2;
+ (void) snprintf(str + offset, alloc_size - offset,
+ "%02x", compressed[i]);
+ }
+ str[compressed_size * 2] = '\0';
+ char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+ ZFS_SEND_RESUME_TOKEN_VERSION,
+ (longlong_t)cksum.zc_word[0],
+ (longlong_t)packed_size, str);
+ kmem_free(packed, packed_size);
+ kmem_free(str, alloc_size);
+ kmem_free(compressed, packed_size);
+ return (propval);
}
/*
- * Returns a string that represents the receive resume stats token of the
- * dataset's child. It should be freed with strfree().
+ * Returns a string that represents the receive resume state token. It should
+ * be freed with strfree(). NULL is returned if no resume state is present.
*/
char *
-get_child_receive_stats(dsl_dataset_t *ds)
+get_receive_resume_token(dsl_dataset_t *ds)
{
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ /*
+ * A failed "newfs" (e.g. full) resumable receive leaves
+ * the stats set on this dataset. Check here for the prop.
+ */
+ char *token = get_receive_resume_token_impl(ds);
+ if (token != NULL)
+ return (token);
+ /*
+ * A failed incremental resumable receive leaves the
+ * stats set on our child named "%recv". Check the child
+ * for the prop.
+ */
+ /* 6 extra bytes for /%recv */
+ char name[ZFS_MAX_DATASET_NAME_LEN + 6];
dsl_dataset_t *recv_ds;
- dsl_dataset_name(ds, recvname);
- if (strlcat(recvname, "/", sizeof (recvname)) <
- sizeof (recvname) &&
- strlcat(recvname, recv_clone_name, sizeof (recvname)) <
- sizeof (recvname) &&
- dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
- &recv_ds) == 0) {
- char *propval = get_receive_resume_stats_impl(recv_ds);
+ dsl_dataset_name(ds, name);
+ if (strlcat(name, "/", sizeof (name)) < sizeof (name) &&
+ strlcat(name, recv_clone_name, sizeof (name)) < sizeof (name) &&
+ dsl_dataset_hold(ds->ds_dir->dd_pool, name, FTAG, &recv_ds) == 0) {
+ token = get_receive_resume_token_impl(recv_ds);
dsl_dataset_rele(recv_ds, FTAG);
- return (propval);
}
- return (kmem_strdup(""));
-}
-
-static void
-get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
-{
- char *propval = get_receive_resume_stats_impl(ds);
- if (strcmp(propval, "") != 0) {
- dsl_prop_nvlist_add_string(nv,
- ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
- } else {
- char *childval = get_child_receive_stats(ds);
- if (strcmp(childval, "") != 0) {
- dsl_prop_nvlist_add_string(nv,
- ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
- }
- kmem_strfree(childval);
- }
- kmem_strfree(propval);
+ return (token);
}
uint64_t
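The token built by get_receive_resume_token_impl() is the hex encoding of a gzip-compressed packed nvlist, prefixed by a dash-separated header holding the token version, one fletcher-4 checksum word, and the packed (uncompressed) size. A standalone sketch of just that outer text format, with the nvlist packing, compression and checksum stubbed out as inputs; TOKEN_VERSION is an assumed stand-in:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOKEN_VERSION 1u  /* assumed stand-in for ZFS_SEND_RESUME_TOKEN_VERSION */

/* Format "<version>-<cksum word 0>-<packed size>-<hex payload>"; the caller
 * frees the returned string. */
static char *
format_token(const uint8_t *compressed, size_t compressed_size,
    uint64_t cksum_word0, size_t packed_size)
{
    size_t hexlen = compressed_size * 2 + 1;    /* two digits per byte + NUL */
    char *hex = malloc(hexlen);

    if (hex == NULL)
        return (NULL);
    hex[0] = '\0';
    for (size_t i = 0; i < compressed_size; i++)
        (void) snprintf(hex + 2 * i, hexlen - 2 * i, "%02x", compressed[i]);

    size_t outlen = hexlen + 64;                /* room for the header fields */
    char *token = malloc(outlen);
    if (token != NULL)
        (void) snprintf(token, outlen, "%u-%llx-%llx-%s", TOKEN_VERSION,
            (unsigned long long)cksum_word0,
            (unsigned long long)packed_size, hex);
    free(hex);
    return (token);
}

int
main(void)
{
    uint8_t payload[] = { 0xde, 0xad, 0xbe, 0xef };  /* pretend gzip output */
    char *tok = format_token(payload, sizeof (payload), 0x1234abcdULL, 128);

    if (tok != NULL) {
        puts(tok);      /* prints "1-1234abcd-80-deadbeef" */
        free(tok);
    }
    return (0);
}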
@@ -2744,6 +2737,8 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
relpath[0] != '\0'))
mnt = value + 1;
+ mnt = kmem_strdup(mnt);
+
if (relpath[0] == '\0') {
(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
root, mnt);
@@ -2753,6 +2748,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
relpath);
}
kmem_free(buf, ZAP_MAXVALUELEN);
+ kmem_strfree(mnt);
}
return (0);
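The kmem_strdup()/kmem_strfree() pair added above appears to be there because mnt can point into value, which is also the snprintf() destination, and reading a source that overlaps the output buffer is undefined. A minimal plain-C illustration of the same fix; the paths are made up:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
    char value[64] = "/old/mount";
    const char *root = "/newroot";

    /* mnt aliases value; copy it out before value is overwritten. */
    char *mnt = strdup(value + 4);              /* "/mount" */
    if (mnt == NULL)
        return (1);
    (void) snprintf(value, sizeof (value), "%s%s", root, mnt);
    free(mnt);

    puts(value);                                /* "/newroot/mount" */
    return (0);
}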
@@ -2761,7 +2757,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
ASSERT(dsl_pool_config_held(dp));
@@ -2812,6 +2808,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
dsl_get_userrefs(ds));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
dsl_get_defer_destroy(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOTS_CHANGED,
+ dsl_dir_snap_cmtime(ds->ds_dir).tv_sec);
dsl_dataset_crypt_stats(ds, nv);
if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
@@ -2823,28 +2821,11 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
}
if (!dsl_dataset_is_snapshot(ds)) {
- /*
- * A failed "newfs" (e.g. full) resumable receive leaves
- * the stats set on this dataset. Check here for the prop.
- */
- get_receive_resume_stats(ds, nv);
-
- /*
- * A failed incremental resumable receive leaves the
- * stats set on our child named "%recv". Check the child
- * for the prop.
- */
- /* 6 extra bytes for /%recv */
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
- dsl_dataset_t *recv_ds;
- dsl_dataset_name(ds, recvname);
- if (strlcat(recvname, "/", sizeof (recvname)) <
- sizeof (recvname) &&
- strlcat(recvname, recv_clone_name, sizeof (recvname)) <
- sizeof (recvname) &&
- dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
- get_receive_resume_stats(recv_ds, nv);
- dsl_dataset_rele(recv_ds, FTAG);
+ char *token = get_receive_resume_token(ds);
+ if (token != NULL) {
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, token);
+ kmem_strfree(token);
}
}
}
@@ -2915,7 +2896,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
if (snap == NULL)
return (B_FALSE);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+ birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds));
rrw_exit(&ds->ds_bp_rwlock, FTAG);
if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
objset_t *os, *os_snap;
@@ -2928,26 +2909,18 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
return (B_TRUE);
if (dmu_objset_from_ds(snap, &os_snap) != 0)
return (B_TRUE);
- return (bcmp(&os->os_phys->os_meta_dnode,
+ return (memcmp(&os->os_phys->os_meta_dnode,
&os_snap->os_phys->os_meta_dnode,
sizeof (os->os_phys->os_meta_dnode)) != 0);
}
return (B_FALSE);
}
-typedef struct dsl_dataset_rename_snapshot_arg {
- const char *ddrsa_fsname;
- const char *ddrsa_oldsnapname;
- const char *ddrsa_newsnapname;
- boolean_t ddrsa_recursive;
- dmu_tx_t *ddrsa_tx;
-} dsl_dataset_rename_snapshot_arg_t;
-
-/* ARGSUSED */
static int
dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
dsl_dataset_t *hds, void *arg)
{
+ (void) dp;
dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
int error;
uint64_t val;
@@ -2973,7 +2946,7 @@ dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
return (error);
}
-static int
+int
dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
{
dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
@@ -3035,7 +3008,7 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
return (0);
}
-static void
+void
dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
{
dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
@@ -3299,8 +3272,8 @@ struct promotenode {
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
- void *tag);
-static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
+ const void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag);
int
dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
@@ -3309,7 +3282,6 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *hds;
struct promotenode *snap;
- dsl_dataset_t *origin_ds, *origin_head;
int err;
uint64_t unused;
uint64_t ss_mv_cnt;
@@ -3329,12 +3301,11 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
}
snap = list_head(&ddpa->shared_snaps);
- origin_head = snap->ds;
if (snap == NULL) {
err = SET_ERROR(ENOENT);
goto out;
}
- origin_ds = snap->ds;
+ dsl_dataset_t *const origin_ds = snap->ds;
/*
* Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
@@ -3430,10 +3401,10 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
* Check that bookmarks that are being transferred don't have
* name conflicts.
*/
- for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
+ for (dsl_bookmark_node_t *dbn = avl_first(&origin_ds->ds_bookmarks);
dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
dsl_dataset_phys(origin_ds)->ds_creation_txg;
- dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) {
+ dbn = AVL_NEXT(&origin_ds->ds_bookmarks, dbn)) {
if (strlen(dbn->dbn_name) >= max_snap_len) {
err = SET_ERROR(ENAMETOOLONG);
goto out;
@@ -3447,7 +3418,8 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
conflicting_snaps = B_TRUE;
} else if (err == ESRCH) {
err = 0;
- } else if (err != 0) {
+ }
+ if (err != 0) {
goto out;
}
}
@@ -3741,6 +3713,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
dsl_dir_rele(odd, FTAG);
promote_rele(ddpa, FTAG);
+
+ /*
+ * Transfer common error blocks from old head to new head.
+ */
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ uint64_t old_head = origin_head->ds_object;
+ uint64_t new_head = hds->ds_object;
+ spa_swap_errlog(dp->dp_spa, new_head, old_head, tx);
+ }
}
/*
@@ -3751,7 +3732,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
*/
static int
snaplist_make(dsl_pool_t *dp,
- uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
+ uint64_t first_obj, uint64_t last_obj, list_t *l, const void *tag)
{
uint64_t obj = last_obj;
@@ -3796,15 +3777,14 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
}
static void
-snaplist_destroy(list_t *l, void *tag)
+snaplist_destroy(list_t *l, const void *tag)
{
struct promotenode *snap;
if (l == NULL || !list_link_active(&l->list_head))
return;
- while ((snap = list_tail(l)) != NULL) {
- list_remove(l, snap);
+ while ((snap = list_remove_tail(l)) != NULL) {
dsl_dataset_rele(snap->ds, tag);
kmem_free(snap, sizeof (*snap));
}
@@ -3812,7 +3792,7 @@ snaplist_destroy(list_t *l, void *tag)
}
static int
-promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, const void *tag)
{
int error;
dsl_dir_t *dd;
@@ -3862,7 +3842,7 @@ out:
}
static void
-promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
+promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag)
{
snaplist_destroy(&ddpa->shared_snaps, tag);
snaplist_destroy(&ddpa->clone_snaps, tag);
@@ -4305,7 +4285,6 @@ typedef struct dsl_dataset_set_qr_arg {
} dsl_dataset_set_qr_arg_t;
-/* ARGSUSED */
static int
dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
{
@@ -4512,7 +4491,6 @@ typedef struct dsl_dataset_set_compression_arg {
uint64_t ddsca_value;
} dsl_dataset_set_compression_arg_t;
-/* ARGSUSED */
static int
dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx)
{
@@ -4540,6 +4518,7 @@ dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx)
uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
spa_feature_t f = zio_compress_to_feature(compval);
+ ASSERT3S(f, !=, SPA_FEATURE_NONE);
ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN);
VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds));
@@ -4951,7 +4930,7 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
if (num_redact_snaps > 0) {
ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t),
KM_SLEEP);
- bcopy(redact_snaps, ftuaa->array, num_redact_snaps *
+ memcpy(ftuaa->array, redact_snaps, num_redact_snaps *
sizeof (uint64_t));
}
dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS,
@@ -4959,19 +4938,45 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;
}
-/* BEGIN CSTYLED */
-#if defined(_LP64)
-#define RECORDSIZE_PERM ZMOD_RW
-#else
-/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
-#define RECORDSIZE_PERM ZMOD_RD
-#endif
-ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM,
+/*
+ * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj
+ * dataset whose birth time is >= min_txg.
+ */
+int
+dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg,
+ uint64_t *oldest_dsobj)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ while (prev_obj != 0 && min_txg < prev_obj_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ if ((error = dsl_dataset_hold_obj(dp, prev_obj,
+ FTAG, &ds)) != 0)
+ return (error);
+ prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+ *oldest_dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, UINT, ZMOD_RW,
"Max allowed record size");
ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW,
"Allow mounting of redacted datasets");
-/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs, zfs_, snapshot_history_enabled, INT, ZMOD_RW,
+ "Include snapshot events in pool history/events");
EXPORT_SYMBOL(dsl_dataset_hold);
EXPORT_SYMBOL(dsl_dataset_hold_flags);
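dsl_dataset_oldest_snapshot() above walks the prev-snapshot chain from the head toward the origin, stepping back while the previous snapshot was still born after min_txg, and returns the last dataset visited. A standalone sketch of the same walk over a plain linked list; struct snap is a hypothetical stand-in for the on-disk dataset objects:

#include <stdint.h>
#include <stdio.h>

struct snap {
    uint64_t obj;           /* dataset object number                     */
    uint64_t prev_txg;      /* birth txg of the previous (older) snapshot */
    struct snap *prev;      /* older snapshot, NULL at the origin         */
};

/* Walk back while the previous snapshot is still new enough; the last node
 * visited is the oldest snapshot born at or after min_txg. */
static uint64_t
oldest_snapshot(const struct snap *head, uint64_t min_txg)
{
    const struct snap *s = head;

    while (s->prev != NULL && min_txg < s->prev_txg)
        s = s->prev;
    return (s->obj);
}

int
main(void)
{
    struct snap s1 = { 101, 0, NULL };      /* oldest, born at txg 50     */
    struct snap s2 = { 102, 50, &s1 };      /* born at txg 80             */
    struct snap head = { 103, 80, &s2 };    /* head dataset               */

    printf("oldest snapshot >= txg 60: %llu\n",
        (unsigned long long)oldest_snapshot(&head, 60));    /* -> 102 */
    return (0);
}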
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
index a77e381520db..eff1f7de7731 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -92,7 +92,7 @@
* will be loaded into memory and shouldn't take up an inordinate amount of
* space. We settled on ~500000 entries, corresponding to roughly 128M.
*/
-unsigned long zfs_livelist_max_entries = 500000;
+uint64_t zfs_livelist_max_entries = 500000;
/*
* We can approximate how much of a performance gain a livelist will give us
@@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
* in parallel. Then open them all in a second pass.
*/
dle->dle_bpobj.bpo_object = za.za_first_integer;
- dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
- 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
+ ZIO_PRIORITY_SYNC_READ);
avl_add(&dl->dl_tree, dle);
}
@@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl)
* in parallel. Then open them all in a second pass.
*/
dlce->dlce_bpobj = za.za_first_integer;
- dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
- 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
+ ZIO_PRIORITY_SYNC_READ);
avl_add(&dl->dl_cache, dlce);
}
VERIFY3U(error, ==, ENOENT);
@@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
}
}
+/*
+ * Prefetch metadata required for dle_enqueue_subobj().
+ */
+static void
+dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+ uint64_t obj)
+{
+ if (dle->dle_bpobj.bpo_object !=
+ dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
+ bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
+}
+
void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
@@ -462,7 +474,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
- dle_tofind.dle_mintxg = bp->blk_birth;
+ dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
@@ -471,7 +483,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
if (dle == NULL) {
zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
- bp, (longlong_t)bp->blk_birth);
+ bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
dle = avl_first(&dl->dl_tree);
}
@@ -542,6 +554,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
ASSERT3P(dle, !=, NULL);
dle_prev = AVL_PREV(&dl->dl_tree, dle);
+ ASSERT3P(dle_prev, !=, NULL);
dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
@@ -809,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
dle_enqueue_subobj(dl, dle, obj, tx);
}
+/*
+ * Prefetch metadata required for dsl_deadlist_insert_bpobj().
+ */
+static void
+dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ dle_prefetch_subobj(dl, dle, obj);
+}
+
static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
@@ -825,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
- zap_cursor_t zc;
- zap_attribute_t za;
+ zap_cursor_t zc, pzc;
+ zap_attribute_t *za, *pza;
dmu_buf_t *bonus;
dsl_deadlist_phys_t *dlp;
dmu_object_info_t doi;
- int error;
+ int error, perror, i;
VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
if (doi.doi_type == DMU_OT_BPOBJ) {
@@ -841,23 +875,46 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
return;
}
+ za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ pza = kmem_alloc(sizeof (*pza), KM_SLEEP);
+
mutex_enter(&dl->dl_lock);
+ /*
+ * Prefetch up to 128 deadlists first and then more as we progress.
+ * The limit is a balance between ARC use and diminishing returns.
+ */
+ for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
+ (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;
+ zap_cursor_advance(&pzc), i++) {
+ dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
+ zfs_strtonum(pza->za_name, NULL));
+ }
for (zap_cursor_init(&zc, dl->dl_os, obj);
- (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ (error = zap_cursor_retrieve(&zc, za)) == 0;
zap_cursor_advance(&zc)) {
- uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
- dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
- VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+ dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
+ zfs_strtonum(za->za_name, NULL), tx);
+ VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
+ if (perror == 0) {
+ dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
+ zfs_strtonum(pza->za_name, NULL));
+ zap_cursor_advance(&pzc);
+ perror = zap_cursor_retrieve(&pzc, pza);
+ }
}
VERIFY3U(error, ==, ENOENT);
zap_cursor_fini(&zc);
+ zap_cursor_fini(&pzc);
VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
dlp = bonus->db_data;
dmu_buf_will_dirty(bonus, tx);
- bzero(dlp, sizeof (*dlp));
+ memset(dlp, 0, sizeof (*dlp));
dmu_buf_rele(bonus, FTAG);
mutex_exit(&dl->dl_lock);
+
+ kmem_free(za, sizeof (*za));
+ kmem_free(pza, sizeof (*pza));
}
/*
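The hunks above and below both add a second ZAP/AVL cursor that runs up to 128 entries ahead of the consumer and is advanced one step per processed entry. A minimal standalone sketch of that cursor-ahead pattern, using hypothetical prefetch()/process_one() callbacks rather than the ZFS routines:

#include <stdio.h>

#define	PREFETCH_AHEAD	128	/* same balance point as the comment above */

static void prefetch(int item) { printf("prefetch %d\n", item); }
static void process_one(int item) { printf("process  %d\n", item); }

static void
consume_with_prefetch(const int *items, int n)
{
	int pi = 0;	/* prefetch cursor, runs ahead of the consumer */

	/* Prime the pipeline with up to PREFETCH_AHEAD items. */
	for (; pi < n && pi < PREFETCH_AHEAD; pi++)
		prefetch(items[pi]);

	/* Consume items, keeping the prefetch cursor the same distance ahead. */
	for (int i = 0; i < n; i++) {
		process_one(items[i]);
		if (pi < n)
			prefetch(items[pi++]);
	}
}

int
main(void)
{
	int items[200];

	for (int i = 0; i < 200; i++)
		items[i] = i;
	consume_with_prefetch(items, 200);
	return (0);
}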
@@ -868,8 +925,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
- dsl_deadlist_entry_t *dle;
+ dsl_deadlist_entry_t *dle, *pdle;
avl_index_t where;
+ int i;
ASSERT(!dl->dl_oldfmt);
@@ -881,11 +939,23 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+ /*
+ * Prefetch up to 128 deadlists first and then more as we progress.
+ * The limit is a balance between ARC use and diminishing returns.
+ */
+ for (pdle = dle, i = 0; pdle && i < 128; i++) {
+ bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+ pdle = AVL_NEXT(&dl->dl_tree, pdle);
+ }
while (dle) {
uint64_t used, comp, uncomp;
dsl_deadlist_entry_t *dle_next;
bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+ if (pdle) {
+ bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
+ pdle = AVL_NEXT(&dl->dl_tree, pdle);
+ }
VERIFY0(bpobj_space(&dle->dle_bpobj,
&used, &comp, &uncomp));
@@ -930,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg)
/* if vdevs are equal, sort by offsets. */
uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
- if (l_dva0_offset == r_dva0_offset)
- ASSERT3U(l->blk_birth, ==, r->blk_birth);
return (TREE_CMP(l_dva0_offset, r_dva0_offset));
}
@@ -946,9 +1014,9 @@ struct livelist_iter_arg {
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
* corresponding FREE are stored in the supplied bplist.
*
- * Note that multiple FREE and ALLOC entries for the same blkptr may
- * be encountered when dedup is involved. For this reason we keep a
- * refcount for all the FREE entries of each blkptr and ensure that
+ * Note that multiple FREE and ALLOC entries for the same blkptr may be
+ * encountered when dedup or block cloning is involved. For this reason we
+ * keep a refcount for all the FREE entries of each blkptr and ensure that
* each of those FREE entries has a corresponding ALLOC preceding it.
*/
static int
@@ -967,6 +1035,12 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
livelist_entry_t node;
node.le_bp = *bp;
livelist_entry_t *found = avl_find(avl, &node, NULL);
+ if (found) {
+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==,
+ BP_GET_CHECKSUM(&found->le_bp));
+ ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp));
+ }
if (bp_freed) {
if (found == NULL) {
/* first free entry for this blkptr */
@@ -976,10 +1050,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
e->le_refcnt = 1;
avl_add(avl, e);
} else {
- /* dedup block free */
- ASSERT(BP_GET_DEDUP(bp));
- ASSERT3U(BP_GET_CHECKSUM(bp), ==,
- BP_GET_CHECKSUM(&found->le_bp));
+ /*
+ * Deduped or cloned block free. We could assert D bit
+ * for dedup, but there is no such one for cloning.
+ */
ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
found->le_refcnt++;
}
@@ -995,14 +1069,6 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
/* all tracked free pairs have been matched */
avl_remove(avl, found);
kmem_free(found, sizeof (livelist_entry_t));
- } else {
- /*
- * This is definitely a deduped blkptr so
- * let's validate it.
- */
- ASSERT(BP_GET_DEDUP(bp));
- ASSERT3U(BP_GET_CHECKSUM(bp), ==,
- BP_GET_CHECKSUM(&found->le_bp));
}
}
}
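As a reading aid for the FREE/ALLOC matching above, here is a toy userland model (block IDs instead of blkptrs, plain arrays instead of the AVL tree of livelist_entry_t nodes); it only illustrates the refcount invariant and is not the ZFS code:

#include <assert.h>
#include <stdio.h>

#define	NBLOCKS	8

typedef struct { int id; int freed; } rec_t;	/* one livelist record */

int
main(void)
{
	int pending_free[NBLOCKS] = { 0 };	/* FREEs waiting for an ALLOC */
	int still_alive[NBLOCKS] = { 0 };	/* unmatched ALLOCs (the to-free list) */

	/* Chronological stream: block 2 allocated then freed; block 3 allocated twice, freed once. */
	rec_t stream[] = {
		{ 1, 0 }, { 2, 0 }, { 2, 1 }, { 3, 0 }, { 3, 1 }, { 3, 0 },
	};
	int n = (int)(sizeof (stream) / sizeof (stream[0]));

	/* Walk newest-to-oldest, so a FREE can show up before its ALLOC, as above. */
	for (int i = n - 1; i >= 0; i--) {
		rec_t *r = &stream[i];
		if (r->freed)
			pending_free[r->id]++;		/* remember the FREE */
		else if (pending_free[r->id] > 0)
			pending_free[r->id]--;		/* ALLOC matched by a later FREE */
		else
			still_alive[r->id]++;		/* block survives, must be freed later */
	}

	for (int id = 0; id < NBLOCKS; id++)
		assert(pending_free[id] == 0);		/* every FREE had a matching ALLOC */
	printf("still alive: block 1 x%d, block 3 x%d\n",
	    still_alive[1], still_alive[3]);
	return (0);
}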
@@ -1028,16 +1094,19 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
.t = t
};
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
+ VERIFY(err != 0 || avl_numnodes(&avl) == 0);
- VERIFY0(avl_numnodes(&avl));
+ void *cookie = NULL;
+ livelist_entry_t *le = NULL;
+ while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) {
+ kmem_free(le, sizeof (livelist_entry_t));
+ }
avl_destroy(&avl);
return (err);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW,
"Size to start the next sub-livelist in a livelist");
ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
"Threshold at which livelist is disabled");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deleg.c b/sys/contrib/openzfs/module/zfs/dsl_deleg.c
index cf8a3c9bbdfb..645ad8e5b8dc 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_deleg.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_deleg.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
index a2748197f29d..d4a6e5b6e9fd 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -49,6 +49,8 @@
#include <sys/zthr.h>
#include <sys/spa_impl.h>
+extern int zfs_snapshot_history_enabled;
+
int
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
{
@@ -130,10 +132,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
- if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+ if (BP_GET_LOGICAL_BIRTH(bp) <=
+ dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
if (poa->ds_prev && !poa->after_branch_point &&
- bp->blk_birth >
+ BP_GET_LOGICAL_BIRTH(bp) >
dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
bp_get_dsize_sync(dp->dp_spa, bp);
@@ -200,7 +203,7 @@ rck_alloc(dsl_dataset_t *clone)
static void
dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx,
- list_t *stack, void *tag)
+ list_t *stack, const void *tag)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
@@ -311,7 +314,8 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
+ tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
@@ -321,14 +325,19 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
- spa_history_log_internal_ds(ds, "defer_destroy", tx, " ");
+ if (zfs_snapshot_history_enabled) {
+ spa_history_log_internal_ds(ds, "defer_destroy", tx,
+ " ");
+ }
return;
}
ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
- /* We need to log before removing it from the namespace. */
- spa_history_log_internal_ds(ds, "destroy", tx, " ");
+ if (zfs_snapshot_history_enabled) {
+ /* We need to log before removing it from the namespace. */
+ spa_history_log_internal_ds(ds, "destroy", tx, " ");
+ }
dsl_scan_ds_destroyed(ds, tx);
@@ -651,7 +660,7 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
zfs_lua_max_memlimit,
fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result);
if (error != 0) {
- char *errorstr = NULL;
+ const char *errorstr = NULL;
(void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
if (errorstr != NULL) {
zfs_dbgmsg("%s", errorstr);
@@ -699,11 +708,11 @@ struct killarg {
dmu_tx_t *tx;
};
-/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
+ (void) spa, (void) dnp;
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
@@ -720,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
ASSERT(zilog == NULL);
- ASSERT3U(bp->blk_birth, >,
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
@@ -1010,7 +1019,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT(ds->ds_prev == NULL ||
dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
+ tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
@@ -1118,6 +1128,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) !=
NULL) {
if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+ dnode_t *rl;
+ VERIFY0(dnode_hold(mos,
+ dbn->dbn_phys.zbm_redaction_obj, FTAG,
+ &rl));
+ if (rl->dn_have_spill) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_REDACTION_LIST_SPILL,
+ tx);
+ }
+ dnode_rele(rl, FTAG);
VERIFY0(dmu_object_free(mos,
dbn->dbn_phys.zbm_redaction_obj, tx));
spa_feature_decr(dmu_objset_spa(mos),
@@ -1153,6 +1173,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
dsl_dataset_rele(prev, FTAG);
}
+ /* Delete errlog. */
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG))
+ spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx);
}
void
@@ -1246,10 +1269,10 @@ dsl_destroy_head(const char *name)
* inconsistent datasets, even if we encounter an error trying to
* process one of them.
*/
-/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
+ (void) arg;
objset_t *os;
if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c
index 84caace4dbab..baf970121a61 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_dir.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -26,6 +26,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
*/
#include <sys/dmu.h>
@@ -54,6 +55,15 @@
#include "zfs_prop.h"
/*
+ * This controls if we verify the ZVOL quota or not.
+ * Currently, quotas are not implemented for ZVOLs.
+ * The quota size is the size of the ZVOL.
+ * The size of the volume already implies the ZVOL size quota.
+ * The quota mechanism can introduce a significant performance drop.
+ */
+static int zvol_enforce_quotas = B_TRUE;
+
+/*
* Filesystem and Snapshot Limits
* ------------------------------
*
@@ -121,8 +131,6 @@
* dsl_dir_init_fs_ss_count().
*/
-extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
-
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
typedef struct ddulrt_arg {
@@ -162,7 +170,7 @@ dsl_dir_evict_async(void *dbu)
int
dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag, dsl_dir_t **ddp)
+ const char *tail, const void *tag, dsl_dir_t **ddp)
{
dmu_buf_t *dbuf;
dsl_dir_t *dd;
@@ -209,8 +217,6 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
}
}
- dsl_dir_snap_cmtime_update(dd);
-
if (dsl_dir_phys(dd)->dd_parent_obj) {
err = dsl_dir_hold_obj(dp,
dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
@@ -272,6 +278,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
}
}
+ if (dsl_dir_is_zapified(dd)) {
+ inode_timespec_t t = {0};
+ (void) zap_lookup(dp->dp_meta_objset, ddobj,
+ DD_FIELD_SNAPSHOTS_CHANGED,
+ sizeof (uint64_t),
+ sizeof (inode_timespec_t) / sizeof (uint64_t),
+ &t);
+ dd->dd_snap_cmtime = t;
+ }
+
dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
&dd->dd_dbuf);
winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
@@ -322,7 +338,7 @@ errout:
}
void
-dsl_dir_rele(dsl_dir_t *dd, void *tag)
+dsl_dir_rele(dsl_dir_t *dd, const void *tag)
{
dprintf_dd(dd, "%s\n", "");
spa_close(dd->dd_pool->dp_spa, tag);
@@ -337,7 +353,7 @@ dsl_dir_rele(dsl_dir_t *dd, void *tag)
* the spa.
*/
void
-dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+dsl_dir_async_rele(dsl_dir_t *dd, const void *tag)
{
dprintf_dd(dd, "%s\n", "");
spa_async_close(dd->dd_pool->dp_spa, tag);
@@ -422,8 +438,7 @@ getcomponent(const char *path, char *component, const char **nextp)
} else if (p[0] == '/') {
if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
- (void) strncpy(component, path, p - path);
- component[p - path] = '\0';
+ (void) strlcpy(component, path, p - path + 1);
p++;
} else if (p[0] == '@') {
/*
@@ -434,8 +449,7 @@ getcomponent(const char *path, char *component, const char **nextp)
return (SET_ERROR(EINVAL));
if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
- (void) strncpy(component, path, p - path);
- component[p - path] = '\0';
+ (void) strlcpy(component, path, p - path + 1);
} else {
panic("invalid p=%p", (void *)p);
}
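The strncpy()-plus-terminator pattern above is replaced with a single strlcpy() whose size argument is one more than the component length, because strlcpy() copies at most size - 1 bytes and always NUL-terminates. A standalone example with made-up names (strlcpy() is in libc on FreeBSD):

#include <stdio.h>
#include <string.h>	/* declares strlcpy() on FreeBSD */

int
main(void)
{
	const char *path = "tank/fs/child";
	const char *p = strchr(path, '/');	/* first '/', right after "tank" */
	char component[64];

	/* Size (p - path) + 1 copies exactly the 4-byte "tank" plus the NUL. */
	(void) strlcpy(component, path, (size_t)(p - path) + 1);
	printf("%s\n", component);		/* prints "tank" */
	return (0);
}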
@@ -451,7 +465,7 @@ getcomponent(const char *path, char *component, const char **nextp)
* (*tail)[0] == '@' means that the last component is a snapshot.
*/
int
-dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
+dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag,
dsl_dir_t **ddp, const char **tailp)
{
char *buf;
@@ -764,6 +778,8 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
*/
if (secpolicy_zfs_proc(cr, proc) == 0)
return (ENFORCE_NEVER);
+#else
+ (void) proc;
#endif
if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
@@ -801,7 +817,7 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
{
objset_t *os = dd->dd_pool->dp_meta_objset;
uint64_t limit, count;
- char *count_prop;
+ const char *count_prop;
enforce_res_t enforce;
int err = 0;
@@ -809,6 +825,18 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
prop == ZFS_PROP_SNAPSHOT_LIMIT);
+ if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+ /*
+ * We don't enforce the limit for temporary snapshots. This is
+ * indicated by a NULL cred_t argument.
+ */
+ if (cr == NULL)
+ return (0);
+
+ count_prop = DD_FIELD_SNAPSHOT_COUNT;
+ } else {
+ count_prop = DD_FIELD_FILESYSTEM_COUNT;
+ }
/*
* If we're allowed to change the limit, don't enforce the limit
* e.g. this can happen if a snapshot is taken by an administrative
@@ -828,19 +856,6 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
if (delta == 0)
return (0);
- if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
- /*
- * We don't enforce the limit for temporary snapshots. This is
- * indicated by a NULL cred_t argument.
- */
- if (cr == NULL)
- return (0);
-
- count_prop = DD_FIELD_SNAPSHOT_COUNT;
- } else {
- count_prop = DD_FIELD_FILESYSTEM_COUNT;
- }
-
/*
* If an ancestor has been provided, stop checking the limit once we
* hit that dir. We need this during rename so that we don't overcount
@@ -1172,10 +1187,9 @@ dsl_dir_space_towrite(dsl_dir_t *dd)
ASSERT(MUTEX_HELD(&dd->dd_lock));
- for (int i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++)
space += dd->dd_space_towrite[i & TXG_MASK];
- ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
- }
+
return (space);
}
@@ -1262,6 +1276,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
uint64_t quota;
struct tempreserve *tr;
int retval;
+ uint64_t ext_quota;
uint64_t ref_rsrv;
top_of_function:
@@ -1305,7 +1320,9 @@ top_of_function:
* If this transaction will result in a net free of space,
* we want to let it through.
*/
- if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
+ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 ||
+ (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL &&
+ zvol_enforce_quotas == B_FALSE))
quota = UINT64_MAX;
else
quota = dsl_dir_phys(dd)->dd_quota;
@@ -1320,7 +1337,6 @@ top_of_function:
* we're very close to full, this will allow a steady trickle of
* removes to get through.
*/
- uint64_t deferred = 0;
if (dd->dd_parent == NULL) {
uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
(netfree) ?
@@ -1335,21 +1351,31 @@ top_of_function:
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
- * on-disk is over quota and there are no pending changes (which
- * may free up space for us).
+ * on-disk is over quota and there are no pending changes
+ * or deferred frees (which may free up space for us).
*/
- if (used_on_disk + est_inflight >= quota) {
- if (est_inflight > 0 || used_on_disk < quota ||
- (retval == ENOSPC && used_on_disk < quota + deferred))
- retval = ERESTART;
+ ext_quota = quota >> 5;
+ if (quota == UINT64_MAX)
+ ext_quota = 0;
+
+ if (used_on_disk >= quota) {
+ if (retval == ENOSPC && (used_on_disk - quota) <
+ dsl_pool_deferred_space(dd->dd_pool)) {
+ retval = SET_ERROR(ERESTART);
+ }
+ /* Quota exceeded */
+ mutex_exit(&dd->dd_lock);
+ DMU_TX_STAT_BUMP(dmu_tx_quota);
+ return (retval);
+ } else if (used_on_disk + est_inflight >= quota + ext_quota) {
dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
- "quota=%lluK tr=%lluK err=%d\n",
+ "quota=%lluK tr=%lluK\n",
(u_longlong_t)used_on_disk>>10,
(u_longlong_t)est_inflight>>10,
- (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval);
+ (u_longlong_t)quota>>10, (u_longlong_t)asize>>10);
mutex_exit(&dd->dd_lock);
DMU_TX_STAT_BUMP(dmu_tx_quota);
- return (SET_ERROR(retval));
+ return (SET_ERROR(ERESTART));
}
/* We need to up our estimated delta before dropping dd_lock */
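The hunk above splits the old single check in two: a hard failure once on-disk usage is at or over the quota (softened to ERESTART only when deferred frees can cover the overage), and an ERESTART once the usage estimate exceeds the quota plus a slack of quota >> 5, i.e. 1/32 or roughly 3%. A worked example with hypothetical numbers, not the ZFS code:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t quota = 100ULL << 30;		/* 100 GiB quota */
	uint64_t ext_quota = quota >> 5;	/* ~3.125 GiB of slack (1/32) */
	uint64_t used_on_disk = 99ULL << 30;	/* just under quota */
	uint64_t est_inflight = 2ULL << 30;	/* dirty/in-flight estimate */

	if (used_on_disk >= quota) {
		/* Real code may still return ERESTART if deferred frees cover the overage. */
		printf("quota exceeded: on-disk usage already at/over quota\n");
	} else if (used_on_disk + est_inflight >= quota + ext_quota) {
		printf("ERESTART: estimate exceeds quota plus slack\n");
	} else {
		printf("accepted: estimate fits within quota + %" PRIu64
		    " bytes of slack\n", ext_quota);
	}
	return (0);
}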
@@ -1377,10 +1403,9 @@ top_of_function:
ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
first = B_FALSE;
goto top_of_function;
-
- } else {
- return (0);
}
+
+ return (0);
}
/*
@@ -1459,7 +1484,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
if (tr_cookie == NULL)
return;
- while ((tr = list_head(tr_list)) != NULL) {
+ while ((tr = list_remove_head(tr_list)) != NULL) {
if (tr->tr_ds) {
mutex_enter(&tr->tr_ds->dd_lock);
ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
@@ -1469,7 +1494,6 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
} else {
arc_tempreserve_clear(tr->tr_size);
}
- list_remove(tr_list, tr);
kmem_free(tr, sizeof (struct tempreserve));
}
@@ -1896,10 +1920,10 @@ typedef struct dsl_valid_rename_arg {
int nest_delta;
} dsl_valid_rename_arg_t;
-/* ARGSUSED */
static int
dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
+ (void) dp;
dsl_valid_rename_arg_t *dvra = arg;
char namebuf[ZFS_MAX_DATASET_NAME_LEN];
@@ -2094,6 +2118,8 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
&mynewname));
+ ASSERT3P(mynewname, !=, NULL);
+
/* Log this before we change the name. */
spa_history_log_internal_dd(dd, "rename", tx,
"-> %s", ddra->ddra_newname);
@@ -2236,13 +2262,25 @@ dsl_dir_snap_cmtime(dsl_dir_t *dd)
}
void
-dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
inode_timespec_t t;
-
gethrestime(&t);
+
mutex_enter(&dd->dd_lock);
dd->dd_snap_cmtime = t;
+ if (spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_EXTENSIBLE_DATASET)) {
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t ddobj = dd->dd_object;
+ dsl_dir_zapify(dd, tx);
+ VERIFY0(zap_update(mos, ddobj,
+ DD_FIELD_SNAPSHOTS_CHANGED,
+ sizeof (uint64_t),
+ sizeof (inode_timespec_t) / sizeof (uint64_t),
+ &t, tx));
+ }
mutex_exit(&dd->dd_lock);
}
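The zap_update() above stores the timestamp as an array of 64-bit integers, which is why the value length is given as sizeof (inode_timespec_t) / sizeof (uint64_t). A tiny standalone illustration with a stand-in struct (not the kernel type):

#include <stdint.h>
#include <stdio.h>

typedef struct { int64_t tv_sec; int64_t tv_nsec; } ts64_t;	/* stand-in for inode_timespec_t */

int
main(void)
{
	ts64_t t = { .tv_sec = 1700000000, .tv_nsec = 123456789 };
	uint64_t words = sizeof (t) / sizeof (uint64_t);

	/* zap_update(..., sizeof (uint64_t), words, &t, tx) stores both fields. */
	printf("value is %llu x 8-byte integers\n", (unsigned long long)words);
	return (0);
}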
@@ -2396,6 +2434,7 @@ dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
* The delete queue is ZPL specific, and libzpool doesn't have
* it. It doesn't make sense to wait for it.
*/
+ (void) ds;
*in_progress = B_FALSE;
break;
#endif
@@ -2448,3 +2487,7 @@ dsl_dir_cancel_waiters(dsl_dir_t *dd)
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);
#endif
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW,
+ "Enable strict ZVOL quota enforcment");
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c
index 1350f1329564..342ec5c15c79 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_pool.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -99,32 +99,31 @@
* capped at zfs_dirty_data_max_max. It can also be overridden with a module
* parameter.
*/
-unsigned long zfs_dirty_data_max = 0;
-unsigned long zfs_dirty_data_max_max = 0;
-int zfs_dirty_data_max_percent = 10;
-int zfs_dirty_data_max_max_percent = 25;
+uint64_t zfs_dirty_data_max = 0;
+uint64_t zfs_dirty_data_max_max = 0;
+uint_t zfs_dirty_data_max_percent = 10;
+uint_t zfs_dirty_data_max_max_percent = 25;
/*
- * zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
- * Once it is reached, write operation is blocked,
- * until log data is cleared out after txg sync.
+ * The upper limit of TX_WRITE log data. Write operations are throttled
+ * when approaching the limit until log data is cleared out after txg sync.
* It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
*/
-unsigned long zfs_wrlog_data_max = 0;
+uint64_t zfs_wrlog_data_max = 0;
/*
* If there's at least this much dirty data (as a percentage of
* zfs_dirty_data_max), push out a txg. This should be less than
* zfs_vdev_async_write_active_min_dirty_percent.
*/
-int zfs_dirty_data_sync_percent = 20;
+static uint_t zfs_dirty_data_sync_percent = 20;
/*
* Once there is this amount of dirty data, the dmu_tx_delay() will kick in
* and delay each transaction.
* This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
*/
-int zfs_delay_min_dirty_percent = 60;
+uint_t zfs_delay_min_dirty_percent = 60;
/*
* This controls how quickly the delay approaches infinity.
@@ -139,12 +138,7 @@ int zfs_delay_min_dirty_percent = 60;
* Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
* multiply in dmu_tx_delay().
*/
-unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
-
-/*
- * This determines the number of threads used by the dp_sync_taskq.
- */
-int zfs_sync_taskq_batch_pct = 75;
+uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
/*
* These tunables determine the behavior of how zil_itxg_clean() is
@@ -172,9 +166,9 @@ int zfs_sync_taskq_batch_pct = 75;
* Additionally, the number of threads used by the taskq can be
* configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
*/
-int zfs_zil_clean_taskq_nthr_pct = 100;
-int zfs_zil_clean_taskq_minalloc = 1024;
-int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
+static int zfs_zil_clean_taskq_nthr_pct = 100;
+static int zfs_zil_clean_taskq_minalloc = 1024;
+static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
@@ -215,9 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
txg_list_create(&dp->dp_early_sync_tasks, spa,
offsetof(dsl_sync_task_t, dst_node));
- dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
- zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
- TASKQ_THREADS_CPU_PCT);
+ dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq");
dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
zfs_zil_clean_taskq_nthr_pct, minclsyspri,
@@ -332,7 +324,6 @@ dsl_pool_open(dsl_pool_t *dp)
/*
* We might not have created the remap bpobj yet.
*/
- err = 0;
} else {
goto out;
}
@@ -411,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp)
txg_list_destroy(&dp->dp_dirty_dirs);
taskq_destroy(dp->dp_zil_clean_taskq);
- taskq_destroy(dp->dp_sync_taskq);
+ spa_sync_tq_destroy(dp->dp_spa);
/*
* We can't set retry to TRUE since we're explicitly specifying
@@ -439,10 +430,8 @@ dsl_pool_close(dsl_pool_t *dp)
taskq_destroy(dp->dp_unlinked_drain_taskq);
taskq_destroy(dp->dp_zrele_taskq);
- if (dp->dp_blkstats != NULL) {
- mutex_destroy(&dp->dp_blkstats->zab_lock);
+ if (dp->dp_blkstats != NULL)
vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
- }
kmem_free(dp, sizeof (dsl_pool_t));
}
@@ -476,8 +465,8 @@ dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
}
dsl_pool_t *
-dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
- uint64_t txg)
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)),
+ dsl_crypto_params_t *dcp, uint64_t txg)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
@@ -623,15 +612,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
/* Choose a value slightly bigger than min dirty sync bytes */
uint64_t sync_min =
- zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
+ zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200;
if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
txg_kick(dp, txg);
}
boolean_t
-dsl_pool_wrlog_over_max(dsl_pool_t *dp)
+dsl_pool_need_wrlog_delay(dsl_pool_t *dp)
{
- return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
+ uint64_t delay_min_bytes =
+ zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
+
+ return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0);
}
static void
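Two thresholds derived from zfs_wrlog_data_max appear above: the per-txg level that kicks a txg sync and the total level at which dsl_pool_need_wrlog_delay() starts delaying writers. A worked example with hypothetical tunable values (the 2x relation between the two maxima is only an assumption for the example):

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t zfs_dirty_data_max = 4ULL << 30;		/* hypothetical: 4 GiB */
	uint64_t zfs_wrlog_data_max = zfs_dirty_data_max * 2;	/* assumed 2x for the example */
	unsigned int zfs_dirty_data_sync_percent = 20;
	unsigned int zfs_delay_min_dirty_percent = 60;

	/* Per-txg wrlog level that kicks a txg sync (dsl_pool_wrlog_count). */
	uint64_t sync_min =
	    zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200;

	/* Total wrlog level that starts delaying writers (dsl_pool_need_wrlog_delay). */
	uint64_t delay_min_bytes =
	    zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;

	printf("kick txg at %" PRIu64 " MiB of per-txg wrlog\n", sync_min >> 20);
	printf("delay writes at %" PRIu64 " MiB of total wrlog\n",
	    delay_min_bytes >> 20);
	return (0);
}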
@@ -641,6 +633,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
aggsum_add(&dp->dp_wrlog_total, delta);
+ /* Compact per-CPU sums after the big change. */
+ (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
+ (void) aggsum_value(&dp->dp_wrlog_total);
}
#ifdef ZFS_DEBUG
@@ -664,12 +659,15 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
return (B_TRUE);
}
+#else
+#define dsl_early_sync_task_verify(dp, txg) \
+ ((void) sizeof (dp), (void) sizeof (txg), B_TRUE)
#endif
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
- zio_t *zio;
+ zio_t *rio; /* root zio for all dirty dataset syncs */
dmu_tx_t *tx;
dsl_dir_t *dd;
dsl_dataset_t *ds;
@@ -699,9 +697,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
}
/*
- * Write out all dirty blocks of dirty datasets.
+ * Write out all dirty blocks of dirty datasets. Note, this could
+ * create a very large (+10k) zio tree.
*/
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
/*
* We must not sync any non-MOS datasets twice, because
@@ -710,9 +709,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
*/
ASSERT(!list_link_active(&ds->ds_synced_link));
list_insert_tail(&synced_datasets, ds);
- dsl_dataset_sync(ds, zio, tx);
+ dsl_dataset_sync(ds, rio, tx);
}
- VERIFY0(zio_wait(zio));
+ VERIFY0(zio_wait(rio));
/*
* Update the long range free counter after
@@ -743,13 +742,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* user accounting information (and we won't get confused
* about which blocks are part of the snapshot).
*/
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
objset_t *os = ds->ds_objset;
ASSERT(list_link_active(&ds->ds_synced_link));
dmu_buf_rele(ds->ds_dbuf, ds);
- dsl_dataset_sync(ds, zio, tx);
+ dsl_dataset_sync(ds, rio, tx);
/*
* Release any key mappings created by calls to
@@ -762,7 +761,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
}
}
- VERIFY0(zio_wait(zio));
+ VERIFY0(zio_wait(rio));
/*
* Now that the datasets have been completely synced, we can
@@ -783,6 +782,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
}
dsl_dataset_sync_done(ds, tx);
+ dmu_buf_rele(ds->ds_dbuf, ds);
}
while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
@@ -947,24 +947,30 @@ dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
return (quota);
}
+uint64_t
+dsl_pool_deferred_space(dsl_pool_t *dp)
+{
+ return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)));
+}
+
boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
- mutex_enter(&dp->dp_lock);
- uint64_t dirty = dp->dp_dirty_total;
- mutex_exit(&dp->dp_lock);
-
- return (dirty > delay_min_bytes);
+ /*
+ * We are not taking the dp_lock here and in a few other places, since torn
+ * reads are unlikely: on 64-bit systems due to register size and on
+ * 32-bit due to memory constraints. Pool-wide locks in hot path may
+ * be too expensive, while we do not need a precise result here.
+ */
+ return (dp->dp_dirty_total > delay_min_bytes);
}
static boolean_t
dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
{
- ASSERT(MUTEX_HELD(&dp->dp_lock));
-
uint64_t dirty_min_bytes =
zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
@@ -1007,7 +1013,6 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
mutex_exit(&dp->dp_lock);
}
-/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
@@ -1042,7 +1047,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
* will be wrong.
*/
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+ ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
rrw_exit(&ds->ds_bp_rwlock, FTAG);
/* The origin doesn't get attached to itself */
@@ -1098,7 +1103,6 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
-/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
@@ -1377,7 +1381,7 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
*/
int
-dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
+dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp)
{
spa_t *spa;
int error;
@@ -1391,14 +1395,14 @@ dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
}
void
-dsl_pool_rele(dsl_pool_t *dp, void *tag)
+dsl_pool_rele(dsl_pool_t *dp, const void *tag)
{
dsl_pool_config_exit(dp, tag);
spa_close(dp->dp_spa, tag);
}
void
-dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
+dsl_pool_config_enter(dsl_pool_t *dp, const void *tag)
{
/*
* We use a "reentrant" reader-writer lock, but not reentrantly.
@@ -1417,14 +1421,14 @@ dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
}
void
-dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag)
{
ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
}
void
-dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
+dsl_pool_config_exit(dsl_pool_t *dp, const void *tag)
{
rrw_exit(&dp->dp_config_rwlock, tag);
}
@@ -1444,37 +1448,33 @@ dsl_pool_config_held_writer(dsl_pool_t *dp)
EXPORT_SYMBOL(dsl_pool_config_enter);
EXPORT_SYMBOL(dsl_pool_config_exit);
-/* BEGIN CSTYLED */
/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
-ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD,
"Max percent of RAM allowed to be dirty");
/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
-ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD,
"zfs_dirty_data_max upper bound as % of RAM");
-ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW,
"Transaction delay threshold");
-ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW,
"Determines the dirty space limit");
-ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW,
"The size limit of write-transaction zil log data");
/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
-ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD,
"zfs_dirty_data_max upper bound in bytes");
-ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW,
"Dirty data txg sync threshold as a percentage of zfs_dirty_data_max");
-ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW,
"How quickly delay approaches infinity");
-ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
- "Max percent of CPUs that are used to sync dirty data");
-
ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
"Max percent of CPUs that are used per dp_sync_taskq");
@@ -1483,4 +1483,3 @@ ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW,
"Max number of taskq entries that are cached");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c
index dfa04d7681be..99f931cd8632 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_prop.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -23,6 +23,7 @@
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
* Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
*/
#include <sys/zfs_context.h>
@@ -41,6 +42,7 @@
#define ZPROP_INHERIT_SUFFIX "$inherit"
#define ZPROP_RECVD_SUFFIX "$recvd"
+#define ZPROP_IUV_SUFFIX "$iuv"
static int
dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
@@ -57,7 +59,7 @@ dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
if (intsz != 1)
return (SET_ERROR(EOVERFLOW));
- (void) strncpy(buf, zfs_prop_default_string(prop),
+ (void) strlcpy(buf, zfs_prop_default_string(prop),
numints);
} else {
if (intsz != 8 || numints < 1)
@@ -69,6 +71,17 @@ dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
return (0);
}
+static int
+dsl_prop_known_index(zfs_prop_t prop, uint64_t value)
+{
+ const char *str = NULL;
+ if (prop != ZPROP_CONT && prop != ZPROP_INVAL &&
+ zfs_prop_get_type(prop) == PROP_TYPE_INDEX)
+ return (!zfs_prop_index_to_string(prop, value, &str));
+
+ return (-1);
+}
+
int
dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
@@ -81,6 +94,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
boolean_t inheriting = B_FALSE;
char *inheritstr;
char *recvdstr;
+ char *iuvstr;
ASSERT(dsl_pool_config_held(dd->dd_pool));
@@ -88,9 +102,10 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
setpoint[0] = '\0';
prop = zfs_name_to_prop(propname);
- inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop));
inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+ iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX);
/*
* Note: dd may become NULL, therefore we shouldn't dereference it
@@ -105,6 +120,18 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
inheriting = B_TRUE;
}
+ /* Check for an iuv value. */
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ iuvstr, intsz, numints, buf);
+ if (err == 0 && dsl_prop_known_index(prop,
+ *(uint64_t *)buf) != 1)
+ err = ENOENT;
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ dsl_dir_name(dd, setpoint);
+ break;
+ }
+
/* Check for a local value. */
err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
propname, intsz, numints, buf);
@@ -155,6 +182,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
kmem_strfree(inheritstr);
kmem_strfree(recvdstr);
+ kmem_strfree(iuvstr);
return (err);
}
@@ -168,7 +196,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
uint64_t zapobj;
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
- inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop));
zapobj = dsl_dataset_phys(ds)->ds_props_obj;
if (zapobj != 0) {
@@ -504,10 +532,10 @@ dsl_prop_hascb(dsl_dataset_t *ds)
return (!list_is_empty(&ds->ds_prop_cbs));
}
-/* ARGSUSED */
static int
dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
+ (void) arg;
dsl_dir_t *dd = ds->ds_dir;
dsl_prop_record_t *pr;
dsl_prop_cb_record_t *cbr;
@@ -647,6 +675,45 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
dsl_dir_rele(dd, FTAG);
}
+
+/*
+ * For newer values in zfs index type properties, we add a new key
+ * propname$iuv (iuv = Ignore Unknown Values) to the properties zap object
+ * to store the new property value and store the default value in the
+ * existing prop key, so that the propname$iuv key is ignored by older zfs
+ * versions and the default property value from the existing prop key is
+ * used.
+ */
+static void
+dsl_prop_set_iuv(objset_t *mos, uint64_t zapobj, const char *propname,
+ int intsz, int numints, const void *value, dmu_tx_t *tx)
+{
+ char *iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX);
+ boolean_t iuv = B_FALSE;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+
+ switch (prop) {
+ case ZFS_PROP_REDUNDANT_METADATA:
+ if (*(uint64_t *)value == ZFS_REDUNDANT_METADATA_SOME ||
+ *(uint64_t *)value == ZFS_REDUNDANT_METADATA_NONE)
+ iuv = B_TRUE;
+ break;
+ default:
+ break;
+ }
+
+ if (iuv) {
+ VERIFY0(zap_update(mos, zapobj, iuvstr, intsz, numints,
+ value, tx));
+ uint64_t val = zfs_prop_default_numeric(prop);
+ VERIFY0(zap_update(mos, zapobj, propname, intsz, numints,
+ &val, tx));
+ } else {
+ zap_remove(mos, zapobj, iuvstr, tx);
+ }
+ kmem_strfree(iuvstr);
+}
+
void
dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
zprop_source_t source, int intsz, int numints, const void *value,
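A toy userland model of the key layout that dsl_prop_set_iuv() above produces for ZFS_PROP_REDUNDANT_METADATA (string values stand in for the on-disk integer indexes; not the ZFS code):

#include <stdio.h>
#include <string.h>

struct zap_pair { const char *name; const char *value; };

static const char *
lookup(const struct zap_pair *zap, int n, const char *name)
{
	for (int i = 0; i < n; i++)
		if (strcmp(zap[i].name, name) == 0)
			return (zap[i].value);
	return (NULL);
}

int
main(void)
{
	/* What setting redundant_metadata=none leaves in the props zap. */
	struct zap_pair zap[] = {
		{ "redundant_metadata", "all" },	/* default, readable by old code */
		{ "redundant_metadata$iuv", "none" },	/* newer value, ignored by old code */
	};
	int n = 2;

	/* Old code path: only knows the plain key. */
	printf("old: %s\n", lookup(zap, n, "redundant_metadata"));

	/* New code path: prefers the $iuv key when its value is a known index. */
	const char *v = lookup(zap, n, "redundant_metadata$iuv");
	printf("new: %s\n", v != NULL ? v : lookup(zap, n, "redundant_metadata"));
	return (0);
}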
@@ -659,6 +726,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
const char *valstr = NULL;
char *inheritstr;
char *recvdstr;
+ char *iuvstr;
char *tbuf = NULL;
int err;
uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
@@ -692,6 +760,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+ iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX);
switch ((int)source) {
case ZPROP_SRC_NONE:
@@ -709,11 +778,14 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
/*
* remove propname$inherit
* set propname -> value
+ * set propname$iuv -> new property value
*/
err = zap_remove(mos, zapobj, inheritstr, tx);
ASSERT(err == 0 || err == ENOENT);
VERIFY0(zap_update(mos, zapobj, propname,
intsz, numints, value, tx));
+ (void) dsl_prop_set_iuv(mos, zapobj, propname, intsz,
+ numints, value, tx);
break;
case ZPROP_SRC_INHERITED:
/*
@@ -723,6 +795,8 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
*/
err = zap_remove(mos, zapobj, propname, tx);
ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, iuvstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
if (version >= SPA_VERSION_RECVD_PROPS &&
dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
dummy = 0;
@@ -749,7 +823,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
ASSERT(err == 0 || err == ENOENT);
err = zap_remove(mos, zapobj, inheritstr, tx);
ASSERT(err == 0 || err == ENOENT);
- fallthrough;
+ zfs_fallthrough;
case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
/*
* remove propname$recvd
@@ -763,6 +837,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
kmem_strfree(inheritstr);
kmem_strfree(recvdstr);
+ kmem_strfree(iuvstr);
/*
* If we are left with an empty snap zap we can destroy it.
@@ -881,7 +956,7 @@ dsl_props_set_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(ENAMETOOLONG));
}
if (nvpair_type(elem) == DATA_TYPE_STRING) {
- char *valstr = fnvpair_value_string(elem);
+ const char *valstr = fnvpair_value_string(elem);
if (strlen(valstr) >= (version <
SPA_VERSION_STMF_PROP ?
ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
@@ -1012,6 +1087,14 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
propname = za.za_name;
source = setpoint;
+
+ /* Skip if iuv entries are present. */
+ valstr = kmem_asprintf("%s%s", propname,
+ ZPROP_IUV_SUFFIX);
+ err = zap_contains(mos, propobj, valstr);
+ kmem_strfree(valstr);
+ if (err == 0)
+ continue;
} else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
/* Skip explicitly inherited entries. */
continue;
@@ -1019,8 +1102,8 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
if (flags & DSL_PROP_GET_LOCAL)
continue;
- (void) strncpy(buf, za.za_name, (suffix - za.za_name));
- buf[suffix - za.za_name] = '\0';
+ (void) strlcpy(buf, za.za_name,
+ MIN(sizeof (buf), suffix - za.za_name + 1));
propname = buf;
if (!(flags & DSL_PROP_GET_RECEIVED)) {
@@ -1044,6 +1127,16 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
source = ((flags & DSL_PROP_GET_INHERITING) ?
setpoint : ZPROP_SOURCE_VAL_RECVD);
+ } else if (strcmp(suffix, ZPROP_IUV_SUFFIX) == 0) {
+ (void) strlcpy(buf, za.za_name,
+ MIN(sizeof (buf), suffix - za.za_name + 1));
+ propname = buf;
+ source = setpoint;
+ prop = zfs_name_to_prop(propname);
+
+ if (dsl_prop_known_index(prop,
+ za.za_first_integer) != 1)
+ continue;
} else {
/*
* For backward compatibility, skip suffixes we don't
@@ -1055,12 +1148,12 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
prop = zfs_name_to_prop(propname);
/* Skip non-inheritable properties. */
- if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
- !zfs_prop_inheritable(prop))
+ if ((flags & DSL_PROP_GET_INHERITING) &&
+ prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop))
continue;
/* Skip properties not valid for this type. */
- if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_USERPROP &&
!zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE))
continue;
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
index d25c067dfbc1..085cfd3c5691 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_scan.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -37,6 +37,7 @@
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
@@ -46,12 +47,14 @@
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/range_tree.h>
+#include <sys/dbuf.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
@@ -126,9 +129,20 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(spa_t *spa);
+static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb);
-extern int zfs_vdev_async_write_active_min_dirty_percent;
+extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
+static int zfs_scan_blkstats = 0;
+
+/*
+ * 'zpool status' uses bytes processed per pass to report throughput and
+ * estimate time remaining. We define a pass to start when the scanning
+ * phase completes for a sequential resilver. Optionally, this value
+ * may be used to reset the pass statistics every N txgs to provide an
+ * estimated completion time based on currently observed performance.
+ */
+static uint_t zfs_scan_report_txgs = 0;
/*
* By default zfs will check to ensure it is not over the hard memory
@@ -136,7 +150,7 @@ extern int zfs_vdev_async_write_active_min_dirty_percent;
* this value can be set to 1 to enable checking before scanning each
* block.
*/
-int zfs_scan_strict_mem_lim = B_FALSE;
+static int zfs_scan_strict_mem_lim = B_FALSE;
/*
* Maximum number of parallelly executed bytes per leaf vdev. We attempt
@@ -146,41 +160,57 @@ int zfs_scan_strict_mem_lim = B_FALSE;
* overload the drives with I/O, since that is protected by
* zfs_vdev_scrub_max_active.
*/
-unsigned long zfs_scan_vdev_limit = 4 << 20;
+static uint64_t zfs_scan_vdev_limit = 16 << 20;
+
+static uint_t zfs_scan_issue_strategy = 0;
-int zfs_scan_issue_strategy = 0;
-int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
-unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
+/* don't queue & sort zios, go direct */
+static int zfs_scan_legacy = B_FALSE;
+static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
/*
* fill_weight is non-tunable at runtime, so we copy it at module init from
* zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
* break queue sorting.
*/
-int zfs_scan_fill_weight = 3;
+static uint_t zfs_scan_fill_weight = 3;
static uint64_t fill_weight;
/* See dsl_scan_should_clear() for details on the memory limit tunables */
-uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
-uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
-int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */
-int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */
-
-int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
-int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
-int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
-int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
-int zfs_scan_checkpoint_intval = 7200; /* in seconds */
+static const uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
+static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
+
+
+/* fraction of physmem */
+static uint_t zfs_scan_mem_lim_fact = 20;
+
+/* fraction of mem lim above */
+static uint_t zfs_scan_mem_lim_soft_fact = 20;
+
+/* minimum milliseconds to scrub per txg */
+static uint_t zfs_scrub_min_time_ms = 1000;
+
+/* minimum milliseconds to obsolete per txg */
+static uint_t zfs_obsolete_min_time_ms = 500;
+
+/* minimum milliseconds to free per txg */
+static uint_t zfs_free_min_time_ms = 1000;
+
+/* minimum milliseconds to resilver per txg */
+static uint_t zfs_resilver_min_time_ms = 3000;
+
+static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */
int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
-int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
-enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
+static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
-unsigned long zfs_async_block_max_blocks = ULONG_MAX;
+static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
/* max number of dedup blocks to free in a single TXG */
-unsigned long zfs_max_async_dedup_frees = 100000;
+static uint64_t zfs_max_async_dedup_frees = 100000;
-int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
+/* set to disable resilver deferring */
+static int zfs_resilver_disable_defer = B_FALSE;
/*
* We wait a few txgs after importing a pool to begin scanning so that
@@ -201,7 +231,10 @@ int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
/*
* Enable/disable the processing of the free_bpobj object.
*/
-int zfs_free_bpobj_enabled = 1;
+static int zfs_free_bpobj_enabled = 1;
+
+/* Error blocks to be scrubbed in one txg. */
+static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
@@ -219,9 +252,9 @@ typedef struct {
/*
* This controls what conditions are placed on dsl_scan_sync_state():
- * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
- * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
- * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
+ * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
* write out the scn_phys_cached version.
* See dsl_scan_sync_state for details.
*/
@@ -264,7 +297,7 @@ typedef struct scan_io {
* event of an error. This array must go at the end of the
* struct to allow this for the variable number of elements.
*/
- dva_t sio_dva[0];
+ dva_t sio_dva[];
} scan_io_t;
#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
@@ -279,12 +312,14 @@ typedef struct scan_io {
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
+ zio_t *q_zio; /* scn_zio_root child for waiting on IO */
/* trees used for sorting I/Os and extents of I/Os */
range_tree_t *q_exts_by_addr;
- zfs_btree_t q_exts_by_size;
+ zfs_btree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
uint64_t q_sio_memused;
+ uint64_t q_last_ext_addr;
/* members for zio rate limiting */
uint64_t q_maxinflight_bytes;
@@ -392,25 +427,25 @@ dsl_scan_resilvering(dsl_pool_t *dp)
static inline void
sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
- bzero(bp, sizeof (*bp));
+ memset(bp, 0, sizeof (*bp));
bp->blk_prop = sio->sio_blk_prop;
- bp->blk_phys_birth = sio->sio_phys_birth;
- bp->blk_birth = sio->sio_birth;
+ BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth);
+ BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth);
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
ASSERT3U(sio->sio_nr_dvas, >, 0);
ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
- bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
+ memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t));
}
static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
sio->sio_blk_prop = bp->blk_prop;
- sio->sio_phys_birth = bp->blk_phys_birth;
- sio->sio_birth = bp->blk_birth;
+ sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
+ sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
sio->sio_cksum = bp->blk_cksum;
sio->sio_nr_dvas = BP_GET_NDVAS(bp);
@@ -447,14 +482,16 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
/*
* Calculate the max number of in-flight bytes for pool-wide
- * scanning operations (minimum 1MB). Limits for the issuing
- * phase are done per top-level vdev and are handled separately.
+ * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+ * Limits for the issuing phase are done per top-level vdev and
+ * are handled separately.
*/
- scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
- dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+ scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+ zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
offsetof(scan_ds_t, sds_node));
+ mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
sizeof (scan_prefetch_issue_ctx_t),
offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
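The scn_maxinflight_bytes calculation above scales with the number of data disks and is now additionally capped at a quarter of arc_c_max. A worked example with hypothetical pool numbers; the formula is the one from the hunk, the constants are made up:

#include <inttypes.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t arc_c_max = 16ULL << 30;		/* e.g. 16 GiB ARC limit */
	uint64_t zfs_scan_vdev_limit = 16ULL << 20;	/* new 16 MiB default */
	uint64_t data_disks = 24;			/* leaf data vdevs in the pool */

	uint64_t maxinflight = MIN(arc_c_max / 4,
	    MAX(1ULL << 20, zfs_scan_vdev_limit * data_disks));

	/* 24 disks * 16 MiB = 384 MiB, well under the 4 GiB ARC-based cap. */
	printf("scn_maxinflight_bytes = %" PRIu64 " MiB\n", maxinflight >> 20);
	return (0);
}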
@@ -481,8 +518,16 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
&scn->scn_phys.scn_queue_obj);
} else {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRORSCRUB, sizeof (uint64_t),
+ ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys);
+
+ if (err != 0 && err != ENOENT)
+ return (err);
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
&scn->scn_phys);
+
/*
* Detect if the pool contains the signature of #2094. If it
* does properly update the scn->scn_phys structure and notify
@@ -507,7 +552,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
return (EOVERFLOW);
}
- bcopy(zaptmp, &scn->scn_phys,
+ memcpy(&scn->scn_phys, zaptmp,
SCAN_PHYS_NUMINTS * sizeof (uint64_t));
scn->scn_phys.scn_flags = overflow;
@@ -529,7 +574,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
* counter to how far we've scanned. We know we're consistent
* up to here.
*/
- scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+ scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
+ scn->scn_phys.scn_skipped;
if (dsl_scan_is_running(scn) &&
spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
@@ -566,7 +612,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
}
}
- bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+ memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
/* reload the queue into the in-core state */
if (scn->scn_phys.scn_queue_obj != 0) {
@@ -585,6 +631,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
}
spa_scan_stat_init(spa);
+ vdev_scan_stat_init(spa->spa_root_vdev);
+
return (0);
}
@@ -599,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp)
scan_ds_queue_clear(scn);
avl_destroy(&scn->scn_queue);
+ mutex_destroy(&scn->scn_queue_lock);
scan_ds_prefetch_queue_clear(scn);
avl_destroy(&scn->scn_prefetch_queue);
@@ -631,18 +680,96 @@ dsl_scan_scrubbing(const dsl_pool_t *dp)
}
boolean_t
+dsl_errorscrubbing(const dsl_pool_t *dp)
+{
+ dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys;
+
+ return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING &&
+ errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB);
+}
+
+boolean_t
+dsl_errorscrub_is_paused(const dsl_scan_t *scn)
+{
+ return (dsl_errorscrubbing(scn->scn_dp) &&
+ scn->errorscrub_phys.dep_paused_flags);
+}
+
+boolean_t
dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
{
return (dsl_scan_scrubbing(scn->scn_dp) &&
scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
}
+static void
+dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ scn->errorscrub_phys.dep_cursor =
+ zap_cursor_serialize(&scn->errorscrub_cursor);
+
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS,
+ &scn->errorscrub_phys, tx));
+}
+
+static void
+dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ pool_scan_func_t *funcp = arg;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!dsl_scan_is_running(scn));
+ ASSERT(!dsl_errorscrubbing(scn->scn_dp));
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+
+ memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
+ scn->errorscrub_phys.dep_func = *funcp;
+ scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING;
+ scn->errorscrub_phys.dep_start_time = gethrestime_sec();
+ scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa);
+ scn->errorscrub_phys.dep_examined = 0;
+ scn->errorscrub_phys.dep_errors = 0;
+ scn->errorscrub_phys.dep_cursor = 0;
+ zap_cursor_init_serialized(&scn->errorscrub_cursor,
+ spa->spa_meta_objset, spa->spa_errlog_last,
+ scn->errorscrub_phys.dep_cursor);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START);
+
+ dsl_errorscrub_sync_state(scn, tx);
+
+ spa_history_log_internal(spa, "error scrub setup", tx,
+ "func=%u mintxg=%u maxtxg=%llu",
+ *funcp, 0, (u_longlong_t)tx->tx_txg);
+}
+
+static int
+dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx)
+{
+ (void) arg;
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) {
+ return (SET_ERROR(EBUSY));
+ }
+
+ if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) {
+ return (ECANCELED);
+ }
+ return (0);
+}
+
/*
* Writes out a persistent dsl_scan_phys_t record to the pool directory.
* Because we can be running in the block sorting algorithm, we do not always
* want to write out the record, only when it is "safe" to do so. This safety
* condition is achieved by making sure that the sorting queues are empty
- * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * (scn_queues_pending == 0). When this condition is not true, the sync'd state
* is inconsistent with how much actual scanning progress has been made. The
* kind of sync to be performed is specified by the sync_type argument. If the
* sync is optional, we only sync if the queues are empty. If the sync is
@@ -665,8 +792,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
int i;
spa_t *spa = scn->scn_dp->dp_spa;
- ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
- if (scn->scn_bytes_pending == 0) {
+ ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0);
+ if (scn->scn_queues_pending == 0) {
for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
@@ -688,7 +815,7 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
&scn->scn_phys, tx));
- bcopy(&scn->scn_phys, &scn->scn_phys_cached,
+ memcpy(&scn->scn_phys_cached, &scn->scn_phys,
sizeof (scn->scn_phys));
if (scn->scn_checkpointing)
@@ -705,14 +832,15 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
}
}
-/* ARGSUSED */
int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
- if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
+ if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) ||
+ dsl_errorscrubbing(scn->scn_dp))
return (SET_ERROR(EBUSY));
return (0);
@@ -721,6 +849,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
pool_scan_func_t *funcp = arg;
dmu_object_type_t ot = 0;
@@ -729,7 +858,15 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
ASSERT(!dsl_scan_is_running(scn));
ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
- bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ memset(&scn->scn_phys, 0, sizeof (scn->scn_phys));
+
+ /*
+ * If we are starting a fresh scrub, we erase the error scrub
+ * information from disk.
+ */
+ memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
+ dsl_errorscrub_sync_state(scn, tx);
+
scn->scn_phys.scn_func = *funcp;
scn->scn_phys.scn_state = DSS_SCANNING;
scn->scn_phys.scn_min_txg = 0;
@@ -744,6 +881,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
scn->scn_last_checkpoint = 0;
scn->scn_checkpointing = B_FALSE;
spa_scan_stat_init(spa);
+ vdev_scan_stat_init(spa->spa_root_vdev);
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
@@ -791,13 +929,19 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
/* back to the generic stuff */
- if (dp->dp_blkstats == NULL) {
- dp->dp_blkstats =
- vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
- mutex_init(&dp->dp_blkstats->zab_lock, NULL,
- MUTEX_DEFAULT, NULL);
+ if (zfs_scan_blkstats) {
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ }
+ memset(&dp->dp_blkstats->zab_type, 0,
+ sizeof (dp->dp_blkstats->zab_type));
+ } else {
+ if (dp->dp_blkstats) {
+ vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+ dp->dp_blkstats = NULL;
+ }
}
- bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
ot = DMU_OT_ZAP_OTHER;
@@ -805,7 +949,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
- bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+ memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
@@ -816,8 +960,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
}
/*
- * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
- * Can also be called to resume a paused scrub.
+ * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub,
+ * error scrub or resilver. Can also be called to resume a paused scrub or
+ * error scrub.
*/
int
dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
@@ -843,6 +988,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
return (0);
}
+ if (func == POOL_SCAN_ERRORSCRUB) {
+ if (dsl_errorscrub_is_paused(dp->dp_scan)) {
+ /*
+ * got error scrub start cmd, resume paused error scrub.
+ */
+ int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+ POOL_SCRUB_NORMAL);
+ if (err == 0) {
+ spa_event_notify(spa, NULL, NULL,
+ ESC_ZFS_ERRORSCRUB_RESUME);
+ return (ECANCELED);
+ }
+ return (SET_ERROR(err));
+ }
+
+ return (dsl_sync_task(spa_name(dp->dp_spa),
+ dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync,
+ &func, 0, ZFS_SPACE_CHECK_RESERVED));
+ }
+
if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
/* got scrub start cmd, resume paused scrub */
int err = dsl_scrub_set_pause_resume(scn->scn_dp,
@@ -851,7 +1016,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
return (SET_ERROR(ECANCELED));
}
-
return (SET_ERROR(err));
}
@@ -859,7 +1023,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
-/* ARGSUSED */
+static void
+dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ if (complete) {
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH);
+ spa_history_log_internal(spa, "error scrub done", tx,
+ "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
+ } else {
+ spa_history_log_internal(spa, "error scrub canceled", tx,
+ "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
+ }
+
+ scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED;
+ spa->spa_scrub_active = B_FALSE;
+ spa_errlog_rotate(spa);
+ scn->errorscrub_phys.dep_end_time = gethrestime_sec();
+ zap_cursor_fini(&scn->errorscrub_cursor);
+
+ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
+ spa->spa_errata = 0;
+
+ ASSERT(!dsl_errorscrubbing(scn->scn_dp));
+}
+
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
{
@@ -920,13 +1110,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
if (dsl_scan_restarting(scn, tx))
spa_history_log_internal(spa, "scan aborted, restarting", tx,
- "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+ "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
else if (!complete)
spa_history_log_internal(spa, "scan cancelled", tx,
- "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+ "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
else
spa_history_log_internal(spa, "scan done", tx,
- "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+ "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
spa->spa_scrub_active = B_FALSE;
@@ -989,7 +1179,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
spa_history_log_internal(spa,
"starting deferred resilver", tx, "errors=%llu",
- (u_longlong_t)spa_get_errlog_size(spa));
+ (u_longlong_t)spa_approx_errlog_size(spa));
spa_async_request(spa, SPA_ASYNC_RESILVER);
}
@@ -1006,10 +1196,96 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
ASSERT(!dsl_scan_is_running(scn));
}
-/* ARGSUSED */
+static int
+dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+ /*
+ * can't pause an error scrub when there is no error scrub
+ * in progress.
+ */
+ if (!dsl_errorscrubbing(dp))
+ return (SET_ERROR(ENOENT));
+
+ /* can't pause a paused error scrub */
+ if (dsl_errorscrub_is_paused(scn))
+ return (SET_ERROR(EBUSY));
+ } else if (*cmd != POOL_SCRUB_NORMAL) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+static void
+dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+ spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
+ scn->errorscrub_phys.dep_paused_flags = B_TRUE;
+ dsl_errorscrub_sync_state(scn, tx);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
+ } else {
+ ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+ if (dsl_errorscrub_is_paused(scn)) {
+ /*
+ * We need to keep track of how much time we spend
+ * paused per pass so that we can adjust the error scrub
+ * rate shown in the output of 'zpool status'.
+ */
+ spa->spa_scan_pass_errorscrub_spent_paused +=
+ gethrestime_sec() -
+ spa->spa_scan_pass_errorscrub_pause;
+
+ spa->spa_scan_pass_errorscrub_pause = 0;
+ scn->errorscrub_phys.dep_paused_flags = B_FALSE;
+
+ zap_cursor_init_serialized(
+ &scn->errorscrub_cursor,
+ spa->spa_meta_objset, spa->spa_errlog_last,
+ scn->errorscrub_phys.dep_cursor);
+
+ dsl_errorscrub_sync_state(scn, tx);
+ }
+ }
+}
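A hedged sketch of how the paused-time bookkeeping above can be consumed by a status reporter; the rate formula is an assumption for illustration and is not part of this change:
/*
 * Illustration only (assumed formula):
 *   elapsed = gethrestime_sec() - dep_start_time
 *             - spa_scan_pass_errorscrub_spent_paused;
 *   rate    = dep_examined / MAX(1, elapsed);
 * Subtracting the paused time keeps the reported error scrub rate from
 * being dragged down by the time spent paused.
 */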
+
+static int
+dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ (void) arg;
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ /* can't cancel an error scrub when none is in progress */
+ if (!dsl_errorscrubbing(scn->scn_dp))
+ return (SET_ERROR(ENOENT));
+ return (0);
+}
+
+static void
+dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ (void) arg;
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ dsl_errorscrub_done(scn, B_FALSE, tx);
+ dsl_errorscrub_sync_state(scn, tx);
+ spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL,
+ ESC_ZFS_ERRORSCRUB_ABORT);
+}
+
static int
dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
if (!dsl_scan_is_running(scn))
@@ -1017,10 +1293,10 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
static void
dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
dsl_scan_done(scn, B_FALSE, tx);
@@ -1031,6 +1307,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
int
dsl_scan_cancel(dsl_pool_t *dp)
{
+ if (dsl_errorscrubbing(dp)) {
+ return (dsl_sync_task(spa_name(dp->dp_spa),
+ dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync,
+ NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+ }
return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}
@@ -1097,6 +1378,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
int
dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
{
+ if (dsl_errorscrubbing(dp)) {
+ return (dsl_sync_task(spa_name(dp->dp_spa),
+ dsl_errorscrub_pause_resume_check,
+ dsl_errorscrub_pause_resume_sync, &cmd, 3,
+ ZFS_SPACE_CHECK_RESERVED));
+ }
return (dsl_sync_task(spa_name(dp->dp_spa),
dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
ZFS_SPACE_CHECK_RESERVED));
@@ -1204,7 +1491,7 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
- ASSERT0(scn->scn_bytes_pending);
+ ASSERT0(scn->scn_queues_pending);
ASSERT(scn->scn_phys.scn_queue_obj != 0);
VERIFY0(dmu_object_free(dp->dp_meta_objset,
@@ -1275,9 +1562,13 @@ dsl_scan_should_clear(dsl_scan_t *scn)
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
- /* # extents in exts_by_size = # in exts_by_addr */
+ /*
+ * # of extents in exts_by_addr = # in exts_by_size.
+ * B-tree efficiency is ~75%, but can be as low as 50%.
+ */
mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_gap_t) + queue->q_sio_memused;
+ ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) *
+ 3 / 2) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
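A worked example of the per-extent memory charge above (struct sizes are assumed for illustration and vary by build):
/*
 * Illustration only: assuming sizeof (range_seg_gap_t) == 24 and an
 * 8-byte q_exts_by_size element, each extent is charged
 *   (24 + 8) * 3 / 2 = 48 bytes
 * i.e. the nominal footprint inflated by 1.5x as a conservative
 * allowance for B-tree fill between ~75% and 50%.
 */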
@@ -1285,7 +1576,7 @@ dsl_scan_should_clear(dsl_scan_t *scn)
dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
if (mused == 0)
- ASSERT0(scn->scn_bytes_pending);
+ ASSERT0(scn->scn_queues_pending);
/*
* If we are above our hard limit, we need to clear out memory.
@@ -1335,12 +1626,13 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
uint64_t sync_time_ns = curr_time_ns -
scn->scn_dp->dp_spa->spa_sync_starttime;
- int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
- int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ uint64_t dirty_min_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_min_dirty_percent / 100;
+ uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
if ((NSEC2MSEC(scan_time_ns) > mintime &&
- (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
txg_sync_waiting(scn->scn_dp) ||
NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
spa_shutting_down(scn->scn_dp->dp_spa) ||
@@ -1378,16 +1670,52 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
return (B_FALSE);
}
+static boolean_t
+dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+ /*
+ * We suspend if:
+ * - we have scrubbed for at least the minimum time (default 1 sec
+ * for error scrub), someone is explicitly waiting for this txg
+ * to complete, or we have used up all of the time in the txg
+ * timeout (default 5 sec).
+ * or
+ * - the spa is shutting down because this pool is being exported
+ * or the machine is rebooting.
+ */
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+ int mintime = zfs_scrub_min_time_ms;
+
+ if ((NSEC2MSEC(error_scrub_time_ns) > mintime &&
+ (txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa)) {
+ if (zb) {
+ dprintf("error scrub suspending at bookmark "
+ "%llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ }
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
typedef struct zil_scan_arg {
dsl_pool_t *zsa_dp;
zil_header_t *zsa_zh;
} zil_scan_arg_t;
-/* ARGSUSED */
static int
dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
uint64_t claim_txg)
{
+ (void) zilog;
zil_scan_arg_t *zsa = arg;
dsl_pool_t *dp = zsa->zsa_dp;
dsl_scan_t *scn = dp->dp_scan;
@@ -1395,7 +1723,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
zbookmark_phys_t zb;
ASSERT(!BP_IS_REDACTED(bp));
- if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ if (BP_IS_HOLE(bp) ||
+ BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -1404,7 +1733,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
* (on-disk) even if it hasn't been claimed (even though for
* scrub there's nothing to do to it).
*/
- if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
+ if (claim_txg == 0 &&
+ BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1414,11 +1744,11 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
return (0);
}
-/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
uint64_t claim_txg)
{
+ (void) zilog;
if (lrc->lrc_txtype == TX_WRITE) {
zil_scan_arg_t *zsa = arg;
dsl_pool_t *dp = zsa->zsa_dp;
@@ -1430,7 +1760,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
ASSERT(!BP_IS_REDACTED(bp));
if (BP_IS_HOLE(bp) ||
- bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -1438,9 +1768,10 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
* already txg sync'ed (but this log block contains
* other records that are not synced)
*/
- if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
return (0);
+ ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));
@@ -1491,7 +1822,7 @@ scan_prefetch_queue_compare(const void *a, const void *b)
}
static void
-scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
+scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag)
{
if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
zfs_refcount_destroy(&spc->spc_refcnt);
@@ -1500,7 +1831,7 @@ scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
}
static scan_prefetch_ctx_t *
-scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
+scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag)
{
scan_prefetch_ctx_t *spc;
@@ -1522,7 +1853,7 @@ scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
}
static void
-scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag)
{
zfs_refcount_add(&spc->spc_refcnt, tag);
}
@@ -1576,7 +1907,8 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
return;
- if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+ if (BP_IS_HOLE(bp) ||
+ BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
BP_GET_TYPE(bp) != DMU_OT_OBJSET))
return;
@@ -1643,6 +1975,7 @@ static void
dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *private)
{
+ (void) zio;
scan_prefetch_ctx_t *spc = private;
dsl_scan_t *scn = spc->spc_scn;
spa_t *spa = scn->scn_dp->dp_spa;
@@ -1687,6 +2020,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
zb->zb_objset, DMU_META_DNODE_OBJECT);
if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_projectused_dnode, zb->zb_objset,
+ DMU_PROJECTUSED_OBJECT);
+ }
dsl_scan_prefetch_dnode(scn,
&osp->os_groupused_dnode, zb->zb_objset,
DMU_GROUPUSED_OBJECT);
@@ -1702,7 +2040,6 @@ out:
scan_prefetch_ctx_rele(spc, scn);
}
-/* ARGSUSED */
static void
dsl_scan_prefetch_thread(void *arg)
{
@@ -1748,10 +2085,16 @@ dsl_scan_prefetch_thread(void *arg)
zio_flags |= ZIO_FLAG_RAW;
}
+ /* We don't need the data L1 buffer since we do not prefetch L0. */
+ blkptr_t *bp = &spic->spic_bp;
+ if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ flags |= ARC_FLAG_NO_BUF;
+
/* issue the prefetch asynchronously */
- (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
- &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
- ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+ (void) arc_read(scn->scn_zio_root, spa, bp,
+ dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
+ zio_flags, &flags, &spic->spic_zb);
kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
}
@@ -1788,24 +2131,23 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
/*
* If we found the block we're trying to resume from, or
- * we went past it to a different object, zero it out to
- * indicate that it's OK to start checking for suspending
- * again.
+ * we went past it, zero it out to indicate that it's OK
+ * to start checking for suspending again.
*/
- if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
- zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ if (zbookmark_subtree_tbd(dnp, zb,
+ &scn->scn_phys.scn_bookmark)) {
dprintf("resuming at %llx/%llx/%llx/%llx\n",
(longlong_t)zb->zb_objset,
(longlong_t)zb->zb_object,
(longlong_t)zb->zb_level,
(longlong_t)zb->zb_blkid);
- bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb));
}
}
return (B_FALSE);
}
-static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
dmu_objset_type_t ostype, dmu_tx_t *tx);
inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
@@ -1822,11 +2164,25 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
const zbookmark_phys_t *zb, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
int err;
ASSERT(!BP_IS_REDACTED(bp));
+ /*
+ * There is an unlikely case of encountering dnodes with contradicting
+ * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag in files created
+ * or modified before commit 4254acb was merged. As it is not possible
+ * to know which of the two is correct, report an error.
+ */
+ if (dnp != NULL &&
+ dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
+ scn->scn_phys.scn_errors++;
+ spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+ return (SET_ERROR(EINVAL));
+ }
+
if (BP_GET_LEVEL(bp) > 0) {
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
@@ -1834,7 +2190,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
arc_buf_t *buf;
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
@@ -1862,7 +2218,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
zio_flags |= ZIO_FLAG_RAW;
}
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
@@ -1881,7 +2237,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
objset_phys_t *osp;
arc_buf_t *buf;
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
@@ -1912,6 +2268,15 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
DMU_USERUSED_OBJECT, tx);
}
arc_buf_destroy(buf, &buf);
+ } else if (!zfs_blkptr_verify(spa, bp,
+ BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
+ /*
+ * Sanity check the block pointer contents; this is handled
+ * by arc_read() for the cases above.
+ */
+ scn->scn_phys.scn_errors++;
+ spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+ return (SET_ERROR(EINVAL));
}
return (0);
@@ -1947,12 +2312,11 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
* first 5; we want them to be useful.
*/
static void
-dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
dmu_objset_type_t ostype, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
- blkptr_t *bp_toread = NULL;
if (dsl_scan_check_suspend(scn, zb))
return;
@@ -1962,19 +2326,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
scn->scn_visited_this_txg++;
- /*
- * This debugging is commented out to conserve stack space. This
- * function is called recursively and the debugging adds several
- * bytes to the stack for each call. It can be commented back in
- * if required to debug an issue in dsl_scan_visitbp().
- *
- * dprintf_bp(bp,
- * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
- * ds, ds ? ds->ds_object : 0,
- * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
- * bp);
- */
-
if (BP_IS_HOLE(bp)) {
scn->scn_holes_this_txg++;
return;
@@ -1986,16 +2337,28 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
return;
}
- if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+ /*
+ * Check if this block contradicts any filesystem flags.
+ */
+ spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS;
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
+ ASSERT(dsl_dataset_feature_is_active(ds, f));
+
+ f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+ if (f != SPA_FEATURE_NONE)
+ ASSERT(dsl_dataset_feature_is_active(ds, f));
+
+ f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
+ if (f != SPA_FEATURE_NONE)
+ ASSERT(dsl_dataset_feature_is_active(ds, f));
+
+ if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
scn->scn_lt_min_this_txg++;
return;
}
- bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- *bp_toread = *bp;
-
- if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
- goto out;
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0)
+ return;
/*
* If dsl_scan_ddt() has already visited this block, it will have
@@ -2005,7 +2368,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
if (ddt_class_contains(dp->dp_spa,
scn->scn_phys.scn_ddt_class_max, bp)) {
scn->scn_ddt_contained_this_txg++;
- goto out;
+ return;
}
/*
@@ -2015,15 +2378,12 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
* Don't scan it now unless we need to because something
* under it was modified.
*/
- if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+ if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
scn->scn_gt_max_this_txg++;
- goto out;
+ return;
}
scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
-
-out:
- kmem_free(bp_toread, sizeof (blkptr_t));
}
static void
@@ -2340,7 +2700,6 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
dsl_scan_sync_state(scn, tx, SYNC_CACHED);
}
-/* ARGSUSED */
static int
enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
@@ -2366,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
return (err);
ds = prev;
}
+ mutex_enter(&scn->scn_queue_lock);
scan_ds_queue_insert(scn, ds->ds_object,
dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ mutex_exit(&scn->scn_queue_lock);
dsl_dataset_rele(ds, FTAG);
return (0);
}
@@ -2525,10 +2886,10 @@ out:
dsl_dataset_rele(ds, FTAG);
}
-/* ARGSUSED */
static int
enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
+ (void) arg;
dsl_dataset_t *ds;
int err;
dsl_scan_t *scn = dp->dp_scan;
@@ -2558,22 +2919,23 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
ds = prev;
}
+ mutex_enter(&scn->scn_queue_lock);
scan_ds_queue_insert(scn, ds->ds_object,
dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ mutex_exit(&scn->scn_queue_lock);
dsl_dataset_rele(ds, FTAG);
return (0);
}
-/* ARGSUSED */
void
dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx)
{
+ (void) tx;
const ddt_key_t *ddk = &dde->dde_key;
ddt_phys_t *ddp = dde->dde_phys;
blkptr_t bp;
zbookmark_phys_t zb = { 0 };
- int p;
if (!dsl_scan_is_running(scn))
return;
@@ -2592,7 +2954,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
if (scn->scn_done_txg != 0)
return;
- for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
continue;
@@ -2609,7 +2971,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
* If there are N references to a deduped block, we don't want to scrub it
* N times -- ideally, we should scrub it exactly once.
*
- * We leverage the fact that the dde's replication class (enum ddt_class)
+ * We leverage the fact that the dde's replication class (ddt_class_t)
* is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
* (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
*
@@ -2640,12 +3002,10 @@ static void
dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
{
ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
- ddt_entry_t dde;
+ ddt_entry_t dde = {{{{0}}}};
int error;
uint64_t n = 0;
- bzero(&dde, sizeof (ddt_entry_t));
-
while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
ddt_t *ddt;
@@ -2708,7 +3068,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
dsl_scan_visit_rootbp(scn, NULL,
&dp->dp_meta_rootbp, tx);
- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
if (scn->scn_suspending)
return;
@@ -2738,7 +3097,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
* In case we suspended right at the end of the ds, zero the
* bookmark so we don't think that we're still trying to resume.
*/
- bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+ memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t));
/*
* Keep pulling things out of the dataset avl queue. Updates to the
@@ -2777,8 +3136,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
}
static uint64_t
-dsl_scan_count_data_disks(vdev_t *rvd)
+dsl_scan_count_data_disks(spa_t *spa)
{
+ vdev_t *rvd = spa->spa_root_vdev;
uint64_t i, leaves = 0;
for (i = 0; i < rvd->vdev_children; i++) {
@@ -2820,12 +3180,13 @@ scan_io_queue_check_suspend(dsl_scan_t *scn)
uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
uint64_t sync_time_ns = curr_time_ns -
scn->scn_dp->dp_spa->spa_sync_starttime;
- int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
- int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ uint64_t dirty_min_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_min_dirty_percent / 100;
+ uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
return ((NSEC2MSEC(scan_time_ns) > mintime &&
- (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
txg_sync_waiting(scn->scn_dp) ||
NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
spa_shutting_down(scn->scn_dp->dp_spa));
@@ -2844,7 +3205,6 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
{
dsl_scan_t *scn = queue->q_scn;
scan_io_t *sio;
- int64_t bytes_issued = 0;
boolean_t suspended = B_FALSE;
while ((sio = list_head(io_list)) != NULL) {
@@ -2856,16 +3216,12 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
}
sio2bp(sio, &bp);
- bytes_issued += SIO_GET_ASIZE(sio);
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
&sio->sio_zb, queue);
(void) list_remove_head(io_list);
scan_io_queues_update_zio_stats(queue, &bp);
sio_free(sio);
}
-
- atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
-
return (suspended);
}
@@ -2910,6 +3266,8 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
+ if (avl_is_empty(&queue->q_sios_by_addr))
+ atomic_add_64(&queue->q_scn->scn_queues_pending, -1);
queue->q_sio_memused -= SIO_GET_MUSED(sio);
bytes_issued += SIO_GET_ASIZE(sio);
@@ -2931,12 +3289,13 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
range_tree_resize_segment(queue->q_exts_by_addr, rs,
SIO_GET_OFFSET(sio), rs_get_end(rs,
queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
-
+ queue->q_last_ext_addr = SIO_GET_OFFSET(sio);
return (B_TRUE);
} else {
uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
+ queue->q_last_ext_addr = -1;
return (B_FALSE);
}
}
@@ -2961,31 +3320,8 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
ASSERT(scn->scn_is_sorted);
- /* handle tunable overrides */
- if (scn->scn_checkpointing || scn->scn_clearing) {
- if (zfs_scan_issue_strategy == 1) {
- return (range_tree_first(rt));
- } else if (zfs_scan_issue_strategy == 2) {
- /*
- * We need to get the original entry in the by_addr
- * tree so we can modify it.
- */
- range_seg_t *size_rs =
- zfs_btree_first(&queue->q_exts_by_size, NULL);
- if (size_rs == NULL)
- return (NULL);
- uint64_t start = rs_get_start(size_rs, rt);
- uint64_t size = rs_get_end(size_rs, rt) - start;
- range_seg_t *addr_rs = range_tree_find(rt, start,
- size);
- ASSERT3P(addr_rs, !=, NULL);
- ASSERT3U(rs_get_start(size_rs, rt), ==,
- rs_get_start(addr_rs, rt));
- ASSERT3U(rs_get_end(size_rs, rt), ==,
- rs_get_end(addr_rs, rt));
- return (addr_rs);
- }
- }
+ if (!scn->scn_checkpointing && !scn->scn_clearing)
+ return (NULL);
/*
* During normal clearing, we want to issue our largest segments
@@ -2996,28 +3332,42 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
* so the way we are sorted now is as good as it will ever get.
* In this case, we instead switch to issuing extents in LBA order.
*/
- if (scn->scn_checkpointing) {
+ if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) ||
+ zfs_scan_issue_strategy == 1)
return (range_tree_first(rt));
- } else if (scn->scn_clearing) {
- /*
- * We need to get the original entry in the by_addr
- * tree so we can modify it.
- */
- range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
- NULL);
- if (size_rs == NULL)
- return (NULL);
- uint64_t start = rs_get_start(size_rs, rt);
- uint64_t size = rs_get_end(size_rs, rt) - start;
- range_seg_t *addr_rs = range_tree_find(rt, start, size);
- ASSERT3P(addr_rs, !=, NULL);
- ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs,
- rt));
- ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt));
- return (addr_rs);
- } else {
- return (NULL);
+
+ /*
+ * Try to continue the previous extent if it is not completed yet. After
+ * shrinking in scan_io_queue_gather() it may no longer be the best, but
+ * otherwise we would leave a shorter remnant behind every txg.
+ */
+ uint64_t start;
+ uint64_t size = 1ULL << rt->rt_shift;
+ range_seg_t *addr_rs;
+ if (queue->q_last_ext_addr != -1) {
+ start = queue->q_last_ext_addr;
+ addr_rs = range_tree_find(rt, start, size);
+ if (addr_rs != NULL)
+ return (addr_rs);
}
+
+ /*
+ * Nothing to continue, so find new best extent.
+ */
+ uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL);
+ if (v == NULL)
+ return (NULL);
+ queue->q_last_ext_addr = start = *v << rt->rt_shift;
+
+ /*
+ * We need to get the original entry in the by_addr tree so we can
+ * modify it.
+ */
+ addr_rs = range_tree_find(rt, start, size);
+ ASSERT3P(addr_rs, !=, NULL);
+ ASSERT3U(rs_get_start(addr_rs, rt), ==, start);
+ ASSERT3U(rs_get_end(addr_rs, rt), >, start);
+ return (addr_rs);
}
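A minimal sketch of the continuation behaviour above, assuming (as the decode `*v << rt->rt_shift` suggests) that q_exts_by_size elements store the extent offset shifted right by rt_shift:
/*
 * Illustration only: if the previous call stopped inside the extent at
 * offset 0x100000, q_last_ext_addr remembers it and the next call tries
 *   addr_rs = range_tree_find(rt, 0x100000, 1ULL << rt->rt_shift);
 * first.  Only once that extent has been fully drained (the lookup
 * fails, or q_last_ext_addr was reset to -1) does it fall back to the
 * best remaining extent by size and decode its address from the B-tree
 * element.
 */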
static void
@@ -3026,15 +3376,19 @@ scan_io_queues_run_one(void *arg)
dsl_scan_io_queue_t *queue = arg;
kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
boolean_t suspended = B_FALSE;
- range_seg_t *rs = NULL;
- scan_io_t *sio = NULL;
+ range_seg_t *rs;
+ scan_io_t *sio;
+ zio_t *zio;
list_t sio_list;
ASSERT(queue->q_scn->scn_is_sorted);
list_create(&sio_list, sizeof (scan_io_t),
offsetof(scan_io_t, sio_nodes.sio_list_node));
+ zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa,
+ NULL, NULL, NULL, ZIO_FLAG_CANFAIL);
mutex_enter(q_lock);
+ queue->q_zio = zio;
/* Calculate maximum in-flight bytes for this vdev. */
queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
@@ -3049,12 +3403,12 @@ scan_io_queues_run_one(void *arg)
/* loop until we run out of time or sios */
while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
uint64_t seg_start = 0, seg_end = 0;
- boolean_t more_left = B_TRUE;
+ boolean_t more_left;
ASSERT(list_is_empty(&sio_list));
/* loop while we still have sios left to process in this rs */
- while (more_left) {
+ do {
scan_io_t *first_sio, *last_sio;
/*
@@ -3083,7 +3437,7 @@ scan_io_queues_run_one(void *arg)
if (suspended)
break;
- }
+ } while (more_left);
/* update statistics for debugging purposes */
scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
@@ -3096,12 +3450,12 @@ scan_io_queues_run_one(void *arg)
* If we were suspended in the middle of processing,
* requeue any unfinished sios and exit.
*/
- while ((sio = list_head(&sio_list)) != NULL) {
- list_remove(&sio_list, sio);
+ while ((sio = list_remove_head(&sio_list)) != NULL)
scan_io_queue_insert_impl(queue, sio);
- }
+ queue->q_zio = NULL;
mutex_exit(q_lock);
+ zio_nowait(zio);
list_destroy(&sio_list);
}
@@ -3122,7 +3476,7 @@ scan_io_queues_run(dsl_scan_t *scn)
ASSERT(scn->scn_is_sorted);
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
- if (scn->scn_bytes_pending == 0)
+ if (scn->scn_queues_pending == 0)
return;
if (scn->scn_taskq == NULL) {
@@ -3290,6 +3644,19 @@ dsl_scan_active(dsl_scan_t *scn)
return ((used != 0) || (clones_left));
}
+boolean_t
+dsl_errorscrub_active(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+ if (dsl_errorscrubbing(scn->scn_dp))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
static boolean_t
dsl_scan_check_deferred(vdev_t *vd)
{
@@ -3439,11 +3806,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
scn->scn_dedup_frees_this_txg = 0;
/*
- * Write out changes to the DDT that may be required as a
- * result of the blocks freed. This ensures that the DDT
- * is clean when a scrub/resilver runs.
+ * Write out changes to the DDT and the BRT that may be required
+ * as a result of the blocks freed. This ensures that the DDT
+ * and the BRT are clean when a scrub/resilver runs.
*/
ddt_sync(spa, tx->tx_txg);
+ brt_sync(spa, tx->tx_txg);
}
if (err != 0)
return (err);
@@ -3505,6 +3873,387 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
return (0);
}
+static void
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
+{
+ zb->zb_objset = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
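For orientation, a hedged example of the string format this parser implies (four ':'-separated numbers; the exact on-disk encoding is assumed from the parse, not shown in this diff):
/*
 * Illustration only: an errlog entry name of the assumed form
 *   "15:8a:0:1c"    (objset:object:level:blkid)
 * would be decoded as zb_objset = 0x15, zb_object = 0x8a, zb_level = 0,
 * zb_blkid = 0x1c, assuming zfs_strtonum() parses hexadecimal.
 */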
+
+static void
+name_to_object(char *buf, uint64_t *obj)
+{
+ *obj = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == '\0');
+}
+
+static void
+read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0)
+ return;
+
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return;
+ }
+
+ /*
+ * If the key is not loaded dbuf_dnode_findbp() will error out with
+ * EACCES. However in that case dnode_hold() will eventually call
+ * dbuf_read()->zio_wait() which may call spa_log_error(). This will
+ * lead to a deadlock due to us holding the mutex spa_errlist_lock.
+ * Avoid this by checking here whether the keys are loaded and, if they
+ * are not, returning early. Without loaded keys the head_errlog feature
+ * is meaningless, as we cannot figure out the birth txg of the block pointer.
+ */
+ if (dsl_dataset_get_keystatus(ds->ds_dir) ==
+ ZFS_KEYSTATUS_UNAVAILABLE) {
+ dsl_dataset_rele(ds, FTAG);
+ return;
+ }
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return;
+ }
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL,
+ NULL);
+
+ if (error) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return;
+ }
+
+ if (!error && BP_IS_HOLE(&bp)) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return;
+ }
+
+ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB;
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb.zb_level == ZB_ZIL_LEVEL)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ ASSERT(!BP_IS_EMBEDDED(&bp));
+ scan_exec_io(dp, &bp, zio_flags, &zb, NULL);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * We keep track of the scrubbed error blocks in "count". This will be used
+ * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This
+ * function is modelled after check_filesystem().
+ */
+static int
+scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep,
+ int *count)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t latest_txg;
+ uint64_t txg_to_consider = spa->spa_syncing_txg;
+ boolean_t check_snapshot = B_TRUE;
+
+ error = find_birth_txg(ds, zep, &latest_txg);
+
+ /*
+ * If find_birth_txg() errors out, then err on the side of caution and
+ * proceed. In the worst case we scrub all objects. If zep->zb_birth
+ * is 0 (e.g. in case of encryption with unloaded keys), also proceed
+ * to scrub all objects.
+ */
+ if (error == 0 && zep->zb_birth == latest_txg) {
+ /* Block was neither freed nor rewritten. */
+ zbookmark_phys_t zb;
+ zep_to_zb(fs, zep, &zb);
+ scn->scn_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ /* We have already acquired the config lock for spa */
+ read_by_block_level(scn, zb);
+
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ scn->errorscrub_phys.dep_examined++;
+ scn->errorscrub_phys.dep_to_examine--;
+ (*count)++;
+ if ((*count) == zfs_scrub_error_blocks_per_txg ||
+ dsl_error_scrub_check_suspend(scn, &zb)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EFAULT));
+ }
+
+ check_snapshot = B_FALSE;
+ } else if (error == 0) {
+ txg_to_consider = latest_txg;
+ }
+
+ /*
+ * Retrieve the number of snapshots if the dataset is not a snapshot.
+ */
+ uint64_t snap_count = 0;
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+
+ error = zap_count(spa->spa_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ }
+
+ if (snap_count == 0) {
+ /* Filesystem without snapshots. */
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ dsl_dataset_rele(ds, FTAG);
+
+ /* Check only snapshots created from this file system. */
+ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
+ snap_obj_txg <= txg_to_consider) {
+
+ error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) {
+ snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dsl_dataset_rele(ds, FTAG);
+ continue;
+ }
+
+ boolean_t affected = B_TRUE;
+ if (check_snapshot) {
+ uint64_t blk_txg;
+ error = find_birth_txg(ds, zep, &blk_txg);
+
+ /*
+ * Scrub the snapshot also when zb_birth == 0 or when
+ * find_birth_txg() returns an error.
+ */
+ affected = (error == 0 && zep->zb_birth == blk_txg) ||
+ (error != 0) || (zep->zb_birth == 0);
+ }
+
+ /* Scrub snapshots. */
+ if (affected) {
+ zbookmark_phys_t zb;
+ zep_to_zb(snap_obj, zep, &zb);
+ scn->scn_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ /* We have already acquired the config lock for spa */
+ read_by_block_level(scn, zb);
+
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ scn->errorscrub_phys.dep_examined++;
+ scn->errorscrub_phys.dep_to_examine--;
+ (*count)++;
+ if ((*count) == zfs_scrub_error_blocks_per_txg ||
+ dsl_error_scrub_check_suspend(scn, &zb)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (EFAULT);
+ }
+ }
+ snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_rele(ds, FTAG);
+ }
+ return (0);
+}
+
+void
+dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ /*
+ * Only process scans in sync pass 1.
+ */
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) {
+ return;
+ }
+
+ if (dsl_scan_resilvering(scn->scn_dp)) {
+ /* cancel the error scrub if resilver started */
+ dsl_scan_cancel(scn->scn_dp);
+ return;
+ }
+
+ spa->spa_scrub_active = B_TRUE;
+ scn->scn_sync_start_time = gethrtime();
+
+ /*
+ * zfs_scan_suspend_progress can be set to disable scrub progress.
+ * See more detailed comment in dsl_scan_sync().
+ */
+ if (zfs_scan_suspend_progress) {
+ uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ int mintime = zfs_scrub_min_time_ms;
+
+ while (zfs_scan_suspend_progress &&
+ !txg_sync_waiting(scn->scn_dp) &&
+ !spa_shutting_down(scn->scn_dp->dp_spa) &&
+ NSEC2MSEC(scan_time_ns) < mintime) {
+ delay(hz);
+ scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ }
+ return;
+ }
+
+ int i = 0;
+ zap_attribute_t *za;
+ zbookmark_phys_t *zb;
+ boolean_t limit_exceeded = B_FALSE;
+
+ za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+ zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
+ zap_cursor_advance(&scn->errorscrub_cursor)) {
+ name_to_bookmark(za->za_name, zb);
+
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ dsl_pool_config_enter(dp, FTAG);
+ read_by_block_level(scn, *zb);
+ dsl_pool_config_exit(dp, FTAG);
+
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ scn->errorscrub_phys.dep_examined += 1;
+ scn->errorscrub_phys.dep_to_examine -= 1;
+ i++;
+ if (i == zfs_scrub_error_blocks_per_txg ||
+ dsl_error_scrub_check_suspend(scn, zb)) {
+ limit_exceeded = B_TRUE;
+ break;
+ }
+ }
+
+ if (!limit_exceeded)
+ dsl_errorscrub_done(scn, B_TRUE, tx);
+
+ dsl_errorscrub_sync_state(scn, tx);
+ kmem_free(za, sizeof (*za));
+ kmem_free(zb, sizeof (*zb));
+ return;
+ }
+
+ int error = 0;
+ for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
+ zap_cursor_advance(&scn->errorscrub_cursor)) {
+
+ zap_cursor_t *head_ds_cursor;
+ zap_attribute_t *head_ds_attr;
+ zbookmark_err_phys_t head_ds_block;
+
+ head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
+ head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ uint64_t head_ds_err_obj = za->za_first_integer;
+ uint64_t head_ds;
+ name_to_object(za->za_name, &head_ds);
+ boolean_t config_held = B_FALSE;
+ uint64_t top_affected_fs;
+
+ for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
+ head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
+ head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
+
+ name_to_errphys(head_ds_attr->za_name, &head_ds_block);
+
+ /*
+ * In case we are called from spa_sync the pool
+ * config is already held.
+ */
+ if (!dsl_pool_config_held(dp)) {
+ dsl_pool_config_enter(dp, FTAG);
+ config_held = B_TRUE;
+ }
+
+ error = find_top_affected_fs(spa,
+ head_ds, &head_ds_block, &top_affected_fs);
+ if (error)
+ break;
+
+ error = scrub_filesystem(spa, top_affected_fs,
+ &head_ds_block, &i);
+
+ if (error == SET_ERROR(EFAULT)) {
+ limit_exceeded = B_TRUE;
+ break;
+ }
+ }
+
+ zap_cursor_fini(head_ds_cursor);
+ kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
+ kmem_free(head_ds_attr, sizeof (*head_ds_attr));
+
+ if (config_held)
+ dsl_pool_config_exit(dp, FTAG);
+ }
+
+ kmem_free(za, sizeof (*za));
+ kmem_free(zb, sizeof (*zb));
+ if (!limit_exceeded)
+ dsl_errorscrub_done(scn, B_TRUE, tx);
+
+ dsl_errorscrub_sync_state(scn, tx);
+}
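A brief hedged note on how the per-txg block limit interacts with the serialized cursor above (the tunable's value is assumed for the example):
/*
 * Illustration only: with an assumed zfs_scrub_error_blocks_per_txg of
 * 4096, a pool with 10,000 logged error blocks is worked through over
 * three txgs (4096 + 4096 + 1808 blocks).  Because dep_cursor is
 * serialized by dsl_errorscrub_sync_state() every txg, the walk resumes
 * where it left off after a pause, export or reboot.
 */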
+
/*
* This is the primary entry point for scans that is called from syncing
* context. Scans must happen entirely during syncing context so that we
@@ -3608,8 +4357,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
*/
if (zfs_scan_suspend_progress) {
uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
- int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
- zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+ uint_t mintime = (scn->scn_phys.scn_func ==
+ POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms :
+ zfs_scrub_min_time_ms;
while (zfs_scan_suspend_progress &&
!txg_sync_waiting(scn->scn_dp) &&
@@ -3622,6 +4372,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
}
/*
+ * Disabled by default, set zfs_scan_report_txgs to report
+ * average performance over the last zfs_scan_report_txgs TXGs.
+ */
+ if (zfs_scan_report_txgs != 0 &&
+ tx->tx_txg % zfs_scan_report_txgs == 0) {
+ scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
+ spa_scan_stat_init(spa);
+ }
+
+ /*
* It is possible to switch from unsorted to sorted at any time,
* but afterwards the scan will remain sorted unless reloaded from
* a checkpoint after a reboot.
@@ -3680,12 +4440,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
taskqid_t prefetch_tqid;
/*
- * Recalculate the max number of in-flight bytes for pool-wide
- * scanning operations (minimum 1MB). Limits for the issuing
- * phase are done per top-level vdev and are handled separately.
+ * Calculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
+ * Limits for the issuing phase are done per top-level vdev and
+ * are handled separately.
*/
- scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
- dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
+ scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
+ zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
if (scnp->scn_ddt_bookmark.ddb_class <=
scnp->scn_ddt_class_max) {
@@ -3749,12 +4510,15 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
if (scn->scn_is_sorted) {
scn->scn_checkpointing = B_TRUE;
scn->scn_clearing = B_TRUE;
+ scn->scn_issued_before_pass +=
+ spa->spa_scan_pass_issued;
+ spa_scan_stat_init(spa);
}
zfs_dbgmsg("scan complete for %s txg %llu",
spa->spa_name,
(longlong_t)tx->tx_txg);
}
- } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+ } else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
ASSERT(scn->scn_clearing);
/* need to issue scrubbing IOs from per-vdev queues */
@@ -3784,7 +4548,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
spa->spa_name);
ASSERT3U(scn->scn_done_txg, !=, 0);
ASSERT0(spa->spa_scrub_inflight);
- ASSERT0(scn->scn_bytes_pending);
+ ASSERT0(scn->scn_queues_pending);
dsl_scan_done(scn, B_TRUE, tx);
sync_type = SYNC_MANDATORY;
}
@@ -3793,10 +4557,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
}
static void
-count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
+count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
{
- int i;
-
/*
* Don't count embedded bp's, since we already did the work of
* scanning these when we scanned the containing block.
@@ -3811,18 +4573,22 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
* zio code will only try the first one unless there is an issue.
* Therefore, we should only count the first DVA for these IOs.
*/
- if (scn->scn_is_sorted) {
- atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[0]));
- } else {
- spa_t *spa = scn->scn_dp->dp_spa;
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
+}
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
- atomic_add_64(&spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[i]));
- }
- }
+static void
+count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
+{
+ if (BP_IS_EMBEDDED(bp))
+ return;
+ atomic_add_64(&scn->scn_phys.scn_skipped,
+ all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
+}
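A short illustration of the `all` flag used by the two counters above (DVA sizes invented for the example):
/*
 * Illustration only: for a block with two DVAs of 128 KiB asize each,
 *   count_block_issued(spa, bp, B_TRUE)  adds 256 KiB (all DVAs), while
 *   count_block_issued(spa, bp, B_FALSE) adds 128 KiB (first DVA only),
 * matching the sorted case where only one copy is actually read.
 */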
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
/*
* If we resume after a reboot, zab will be NULL; don't record
* incomplete stats in that case.
@@ -3830,9 +4596,7 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
if (zab == NULL)
return;
- mutex_enter(&zab->zab_lock);
-
- for (i = 0; i < 4; i++) {
+ for (int i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
@@ -3867,28 +4631,27 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
break;
}
}
-
- mutex_exit(&zab->zab_lock);
}
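For clarity, the four passes of the loop above bucket each block as follows (derived directly from the (i < 2) and (i & 1) tests):
/*
 *   i = 0:  (block's level,  DMU_OT_TOTAL)
 *   i = 1:  (block's level,  block's type)
 *   i = 2:  (DN_MAX_LEVELS,  DMU_OT_TOTAL)
 *   i = 3:  (DN_MAX_LEVELS,  block's type)
 * i.e. per-level and aggregate rows, each with a per-type and a total
 * column.
 */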
static void
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
avl_index_t idx;
- int64_t asize = SIO_GET_ASIZE(sio);
dsl_scan_t *scn = queue->q_scn;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+ if (unlikely(avl_is_empty(&queue->q_sios_by_addr)))
+ atomic_add_64(&scn->scn_queues_pending, 1);
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
/* block is already scheduled for reading */
- atomic_add_64(&scn->scn_bytes_pending, -asize);
sio_free(sio);
return;
}
avl_insert(&queue->q_sios_by_addr, sio, idx);
queue->q_sio_memused += SIO_GET_MUSED(sio);
- range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio),
+ SIO_GET_ASIZE(sio));
}
/*
@@ -3901,7 +4664,6 @@ static void
scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
int zio_flags, const zbookmark_phys_t *zb)
{
- dsl_scan_t *scn = queue->q_scn;
scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
ASSERT0(BP_IS_GANG(bp));
@@ -3911,13 +4673,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
sio->sio_flags = zio_flags;
sio->sio_zb = *zb;
- /*
- * Increment the bytes pending counter now so that we can't
- * get an integer underflow in case the worker processes the
- * zio before we get to incrementing this counter.
- */
- atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
-
+ queue->q_last_ext_addr = -1;
scan_io_queue_insert_impl(queue, sio);
}
@@ -3967,15 +4723,15 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
- uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ uint64_t phys_birth = BP_GET_BIRTH(bp);
size_t psize = BP_GET_PSIZE(bp);
boolean_t needs_io = B_FALSE;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-
+ count_block(dp->dp_blkstats, bp);
if (phys_birth <= scn->scn_phys.scn_min_txg ||
phys_birth >= scn->scn_phys.scn_max_txg) {
- count_block(scn, dp->dp_blkstats, bp);
+ count_block_skipped(scn, bp, B_TRUE);
return (0);
}
@@ -4003,8 +4759,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
* Keep track of how much data we've examined so that
* zpool(8) status can make useful progress reports.
*/
- scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
- spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ scn->scn_phys.scn_examined += asize;
+ spa->spa_scan_pass_exam += asize;
/* if it's a resilver, this may not be in the target range */
if (!needs_io)
@@ -4015,7 +4772,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (needs_io && !zfs_no_scrub_io) {
dsl_scan_enqueue(dp, bp, zio_flags, zb);
} else {
- count_block(scn, dp->dp_blkstats, bp);
+ count_block_skipped(scn, bp, B_TRUE);
}
/* do not relocate this block */
@@ -4047,7 +4804,14 @@ dsl_scan_scrub_done(zio_t *zio)
if (zio->io_error && (zio->io_error != ECKSUM ||
!(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
- atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+ if (dsl_errorscrubbing(spa->spa_dsl_pool) &&
+ !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) {
+ atomic_inc_64(&spa->spa_dsl_pool->dp_scan
+ ->errorscrub_phys.dep_errors);
+ } else {
+ atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys
+ .scn_errors);
+ }
}
}
@@ -4066,6 +4830,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
dsl_scan_t *scn = dp->dp_scan;
size_t size = BP_GET_PSIZE(bp);
abd_t *data = abd_alloc_for_io(size, B_FALSE);
+ zio_t *pio;
if (queue == NULL) {
ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
@@ -4074,6 +4839,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
mutex_exit(&spa->spa_scrub_lock);
+ pio = scn->scn_zio_root;
} else {
kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
@@ -4082,12 +4848,14 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
cv_wait(&queue->q_zio_cv, q_lock);
queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+ pio = queue->q_zio;
mutex_exit(q_lock);
}
- count_block(scn, dp->dp_blkstats, bp);
- zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
- dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+ ASSERT(pio != NULL);
+ count_block_issued(spa, bp, queue == NULL);
+ zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done,
+ queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/*
@@ -4121,33 +4889,93 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
* extents that are more completely filled (in a 3:2 ratio) vs just larger.
* Note that as an optimization, we replace multiplication and division by
* 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ *
+ * Since we do not care whether one extent is only a few percent better
+ * than another, compress the score into 6 bits via a binary logarithm
+ * (highbit64()) and store it in the high bits of the offset, which are
+ * otherwise unused due to ashift.  This reduces q_exts_by_size B-tree
+ * elements to only 64 bits and lets them be compared with a single
+ * operation.  It also makes scrubs more sequential and reduces the
+ * chance that a minor extent change moves it within the B-tree.
*/
+__attribute__((always_inline)) inline
static int
ext_size_compare(const void *x, const void *y)
{
- const range_seg_gap_t *rsa = x, *rsb = y;
+ const uint64_t *a = x, *b = y;
- uint64_t sa = rsa->rs_end - rsa->rs_start;
- uint64_t sb = rsb->rs_end - rsb->rs_start;
- uint64_t score_a, score_b;
+ return (TREE_CMP(*a, *b));
+}
- score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
- fill_weight * rsa->rs_fill) >> 7);
- score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
- fill_weight * rsb->rs_fill) >> 7);
+ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
+ ext_size_compare)
- if (score_a > score_b)
- return (-1);
- if (score_a == score_b) {
- if (rsa->rs_start < rsb->rs_start)
- return (-1);
- if (rsa->rs_start == rsb->rs_start)
- return (0);
- return (1);
- }
- return (1);
+static void
+ext_size_create(range_tree_t *rt, void *arg)
+{
+ (void) rt;
+ zfs_btree_t *size_tree = arg;
+
+ zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
+ sizeof (uint64_t));
}
+static void
+ext_size_destroy(range_tree_t *rt, void *arg)
+{
+ (void) rt;
+ zfs_btree_t *size_tree = arg;
+ ASSERT0(zfs_btree_numnodes(size_tree));
+
+ zfs_btree_destroy(size_tree);
+}
+
+static uint64_t
+ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg)
+{
+ (void) rt;
+ uint64_t size = rsg->rs_end - rsg->rs_start;
+ uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) *
+ fill_weight * rsg->rs_fill) >> 7);
+ ASSERT3U(rt->rt_shift, >=, 8);
+ return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start);
+}
+
+static void
+ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+ ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
+ uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
+ zfs_btree_add(size_tree, &v);
+}
+
+static void
+ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+ ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
+ uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
+ zfs_btree_remove(size_tree, &v);
+}
+
+static void
+ext_size_vacate(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
+
+ ext_size_create(rt, arg);
+}
+
+static const range_tree_ops_t ext_size_ops = {
+ .rtop_create = ext_size_create,
+ .rtop_destroy = ext_size_destroy,
+ .rtop_add = ext_size_add,
+ .rtop_remove = ext_size_remove,
+ .rtop_vacate = ext_size_vacate
+};
+
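The score packing described in the comment above is easier to see with concrete numbers. The following is a minimal userland sketch, not the kernel code: it uses a hypothetical extent struct, a portable stand-in for highbit64(), and assumes the usual fill weight of 3, to show how the inverted log-score lands in the top byte of a 64-bit key while the extent's start offset occupies the low bits, so a plain integer comparison orders extents by score first and by address second.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Portable stand-in for the kernel's highbit64(): highest set bit, 1-based. */
static int
highbit64(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

/* Hypothetical extent: start offset, length and filled bytes, all in bytes. */
typedef struct {
	uint64_t start;
	uint64_t size;
	uint64_t fill;
} extent_t;

static const uint64_t fill_weight = 3;	/* assumed zfs_scan_fill_weight */

/*
 * Same shape as ext_size_value(): weight the fill by the fill ratio
 * (the shifts by 7 approximate multiplying and dividing by 100), keep
 * only the binary logarithm of the score, invert it so that higher
 * scores yield smaller keys, and pack it into the top byte above the
 * start offset, whose upper bits the kernel code can assume are unused.
 */
static uint64_t
ext_key(const extent_t *e)
{
	uint64_t score = e->fill +
	    ((((e->fill << 7) / e->size) * fill_weight * e->fill) >> 7);

	return (((uint64_t)(64 - highbit64(score)) << 56) | e->start);
}

int
main(void)
{
	extent_t full = { .start = 1 << 20, .size = 1 << 17, .fill = 1 << 17 };
	extent_t sparse = { .start = 8 << 20, .size = 4 << 20, .fill = 1 << 18 };

	/* The fuller extent scores higher, so it gets the smaller key. */
	printf("full   key = %016" PRIx64 "\n", ext_key(&full));
	printf("sparse key = %016" PRIx64 "\n", ext_key(&sparse));
	return (0);
}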
/*
* Comparator for the q_sios_by_addr tree. Sorting is simply performed
* based on LBA-order (from lowest to highest).
@@ -4170,9 +4998,10 @@ scan_io_queue_create(vdev_t *vd)
q->q_scn = scn;
q->q_vd = vd;
q->q_sio_memused = 0;
+ q->q_last_ext_addr = -1;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
- q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP,
- &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap);
+ q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP,
+ &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap);
avl_create(&q->q_sios_by_addr, sio_addr_compare,
sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
@@ -4190,21 +5019,20 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
dsl_scan_t *scn = queue->q_scn;
scan_io_t *sio;
void *cookie = NULL;
- int64_t bytes_dequeued = 0;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+ if (!avl_is_empty(&queue->q_sios_by_addr))
+ atomic_add_64(&scn->scn_queues_pending, -1);
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
NULL) {
ASSERT(range_tree_contains(queue->q_exts_by_addr,
SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
- bytes_dequeued += SIO_GET_ASIZE(sio);
queue->q_sio_memused -= SIO_GET_MUSED(sio);
sio_free(sio);
}
ASSERT0(queue->q_sio_memused);
- atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
range_tree_destroy(queue->q_exts_by_addr);
avl_destroy(&queue->q_sios_by_addr);
@@ -4300,28 +5128,22 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
sio_free(srch_sio);
if (sio != NULL) {
- int64_t asize = SIO_GET_ASIZE(sio);
blkptr_t tmpbp;
/* Got it while it was cold in the queue */
ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
- ASSERT3U(size, ==, asize);
+ ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
avl_remove(&queue->q_sios_by_addr, sio);
+ if (avl_is_empty(&queue->q_sios_by_addr))
+ atomic_add_64(&scn->scn_queues_pending, -1);
queue->q_sio_memused -= SIO_GET_MUSED(sio);
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
- /*
- * We only update scn_bytes_pending in the cold path,
- * otherwise it will already have been accounted for as
- * part of the zio's execution.
- */
- atomic_add_64(&scn->scn_bytes_pending, -asize);
-
- /* count the block as though we issued it */
+ /* count the block as though we skipped it */
sio2bp(sio, &tmpbp);
- count_block(scn, dp->dp_blkstats, &tmpbp);
+ count_block_skipped(scn, &tmpbp, B_FALSE);
sio_free(sio);
}
@@ -4379,20 +5201,19 @@ dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW,
"Max bytes in flight per leaf vdev for scrubs and resilvers");
-ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW,
"Min millisecs to scrub per txg");
-ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW,
"Min millisecs to obsolete per txg");
-ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW,
"Min millisecs to free per txg");
-ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW,
"Min millisecs to resilver per txg");
ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW,
@@ -4404,40 +5225,48 @@ ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW,
"Set to disable scrub prefetching");
-ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW,
"Max number of blocks freed in one txg");
-ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW,
"Max number of dedup blocks freed in one txg");
ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
"Enable processing of the free_bpobj");
-ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW,
+ "Enable block statistics calculation during scrub");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW,
"Fraction of RAM for scan hard limit");
-ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW,
- "IO issuing strategy during scrubbing. "
- "0 = default, 1 = LBA, 2 = size");
+ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW,
+ "IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size");
ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
"Scrub using legacy non-sequential method");
-ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW,
"Scan progress on-disk checkpointing interval");
-ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW,
"Max gap in bytes between sequential scrub / resilver I/Os");
-ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW,
"Fraction of hard limit used as soft limit");
ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
"Tunable to attempt to reduce lock contention");
-ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW,
"Tunable to adjust bias towards more filled segments during scans");
+ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
+ "Tunable to report resilver performance over the last N txgs");
+
ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
"Process all resilvers immediately");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW,
+ "Error blocks to be scrubbed in one txg");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_synctask.c b/sys/contrib/openzfs/module/zfs/dsl_synctask.c
index 148e8fff2437..409e12884d91 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_synctask.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_synctask.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -32,10 +32,10 @@
#define DST_AVG_BLKSHIFT 14
-/* ARGSUSED */
static int
dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
{
+ (void) arg, (void) tx;
return (0);
}
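The dsl_null_checkfunc() hunk above shows a pattern repeated throughout this import: the lint-era /* ARGSUSED */ annotations are replaced with explicit (void) casts of the unused parameters, which modern compilers understand directly. A minimal standalone illustration of the idiom follows; the function name here is made up.

#include <stdio.h>

/*
 * Hypothetical callback with the same shape as dsl_null_checkfunc():
 * both parameters are required by the callback signature but unused.
 * Casting them to void documents that and silences -Wunused-parameter
 * warnings without relying on the lint-only ARGSUSED annotation.
 */
static int
null_checkfunc(void *arg, void *tx)
{
	(void) arg, (void) tx;
	return (0);
}

int
main(void)
{
	printf("check returned %d\n", null_checkfunc(NULL, NULL));
	return (0);
}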
diff --git a/sys/contrib/openzfs/module/zfs/dsl_userhold.c b/sys/contrib/openzfs/module/zfs/dsl_userhold.c
index 75d153194a00..75953f70f926 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_userhold.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_userhold.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -115,7 +115,7 @@ dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
dsl_dataset_t *ds;
int error = 0;
- char *htag, *name;
+ const char *htag, *name;
/* must be a snapshot */
name = nvpair_name(pair);
@@ -346,7 +346,7 @@ dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
return (ret);
}
-typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, const void *tag,
dsl_dataset_t **dsp);
typedef struct dsl_dataset_user_release_arg {
@@ -359,7 +359,7 @@ typedef struct dsl_dataset_user_release_arg {
/* Place a dataset hold on the snapshot identified by passed dsobj string */
static int
-dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, const void *tag,
dsl_dataset_t **dsp)
{
return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp));
@@ -572,7 +572,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
{
dsl_dataset_user_release_arg_t ddura;
nvpair_t *pair;
- char *pool;
+ const char *pool;
int error;
pair = nvlist_next_nvpair(holds, NULL);
diff --git a/sys/contrib/openzfs/module/zfs/edonr_zfs.c b/sys/contrib/openzfs/module/zfs/edonr_zfs.c
index aa00e1c9417e..db21c9cf197c 100644
--- a/sys/contrib/openzfs/module/zfs/edonr_zfs.c
+++ b/sys/contrib/openzfs/module/zfs/edonr_zfs.c
@@ -45,7 +45,6 @@ edonr_incremental(void *buf, size_t size, void *arg)
/*
* Native zio_checksum interface for the Edon-R hash function.
*/
-/*ARGSUSED*/
void
abd_checksum_edonr_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
@@ -54,10 +53,10 @@ abd_checksum_edonr_native(abd_t *abd, uint64_t size,
EdonRState ctx;
ASSERT(ctx_template != NULL);
- bcopy(ctx_template, &ctx, sizeof (ctx));
+ memcpy(&ctx, ctx_template, sizeof (ctx));
(void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
EdonRFinal(&ctx, digest);
- bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
+ memcpy(zcp->zc_word, digest, sizeof (zcp->zc_word));
}
/*
@@ -89,18 +88,17 @@ abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
* size by double-hashing it (the new salt block will be composed of
* H(salt) || H(H(salt))).
*/
- CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
- EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
- salt_block);
- EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block +
- EDONR_MODE / 8);
+ _Static_assert(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8),
+ "Edon-R block size mismatch");
+ EdonRHash(salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, salt_block);
+ EdonRHash(salt_block, EDONR_MODE, salt_block + EDONR_MODE / 8);
/*
* Feed the new salt block into the hash function - this will serve
* as our MAC key.
*/
ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
- EdonRInit(ctx, EDONR_MODE);
+ EdonRInit(ctx);
EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8);
return (ctx);
}
@@ -108,8 +106,8 @@ abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
void
abd_checksum_edonr_tmpl_free(void *ctx_template)
{
- EdonRState *ctx = ctx_template;
+ EdonRState *ctx = ctx_template;
- bzero(ctx, sizeof (*ctx));
+ memset(ctx, 0, sizeof (*ctx));
kmem_free(ctx, sizeof (*ctx));
}
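Another mechanical change visible in edonr_zfs.c, and again in gzip.c and hkdf.c below, is the switch from the BSD bcopy()/bzero() calls to the standard memcpy()/memset(). The one trap is argument order: bcopy takes (src, dst, len) while memcpy takes (dst, src, len), so every converted call swaps its first two arguments. A small sketch of the equivalence, assuming a POSIX strings.h that still provides the legacy bcopy() for comparison:

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>	/* legacy bcopy(); assumed available for the demo */

int
main(void)
{
	const char src[] = "edonr";
	char a[8] = { 0 };
	char b[8] = { 0 };

	bcopy(src, a, sizeof (src));	/* (src, dst, len) */
	memcpy(b, src, sizeof (src));	/* (dst, src, len) */
	assert(memcmp(a, b, sizeof (a)) == 0);

	/* Likewise, bzero(a, sizeof (a)) becomes memset(a, 0, sizeof (a)). */
	memset(a, 0, sizeof (a));
	assert(a[0] == 0);

	printf("bcopy and memcpy produced identical copies of \"%s\"\n", b);
	return (0);
}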
diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c
index b8a1c7c8a5ca..77d87b694a43 100644
--- a/sys/contrib/openzfs/module/zfs/fm.c
+++ b/sys/contrib/openzfs/module/zfs/fm.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -68,9 +68,9 @@
#include <sys/condvar.h>
#include <sys/zfs_ioctl.h>
-int zfs_zevent_len_max = 512;
+static uint_t zfs_zevent_len_max = 512;
-static int zevent_len_cur = 0;
+static uint_t zevent_len_cur = 0;
static int zevent_waiters = 0;
static int zevent_flags = 0;
@@ -148,8 +148,7 @@ zfs_zevent_drain(zevent_t *ev)
list_remove(&zevent_list, ev);
/* Remove references to this event in all private file data */
- while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
- list_remove(&ev->ev_ze_list, ze);
+ while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) {
ze->ze_zevent = NULL;
ze->ze_dropped++;
}
@@ -158,7 +157,7 @@ zfs_zevent_drain(zevent_t *ev)
}
void
-zfs_zevent_drain_all(int *count)
+zfs_zevent_drain_all(uint_t *count)
{
zevent_t *ev;
@@ -380,8 +379,7 @@ zfs_zevent_wait(zfs_zevent_t *ze)
break;
}
- error = cv_wait_sig(&zevent_cv, &zevent_lock);
- if (signal_pending(current)) {
+ if (cv_wait_sig(&zevent_cv, &zevent_lock) == 0) {
error = SET_ERROR(EINTR);
break;
} else if (!list_is_empty(&zevent_list)) {
@@ -483,21 +481,21 @@ zfs_zevent_destroy(zfs_zevent_t *ze)
/*
* Wrappers for FM nvlist allocators
*/
-/* ARGSUSED */
static void *
i_fm_alloc(nv_alloc_t *nva, size_t size)
{
- return (kmem_zalloc(size, KM_SLEEP));
+ (void) nva;
+ return (kmem_alloc(size, KM_SLEEP));
}
-/* ARGSUSED */
static void
i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
{
+ (void) nva;
kmem_free(buf, size);
}
-const nv_alloc_ops_t fm_mem_alloc_ops = {
+static const nv_alloc_ops_t fm_mem_alloc_ops = {
.nv_ao_init = NULL,
.nv_ao_fini = NULL,
.nv_ao_alloc = i_fm_alloc,
@@ -702,7 +700,7 @@ i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
case DATA_TYPE_STRING_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_string_array(payload, name,
- va_arg(ap, char **), nelem);
+ va_arg(ap, const char **), nelem);
break;
case DATA_TYPE_NVLIST:
ret = nvlist_add_nvlist(payload, name,
@@ -711,7 +709,7 @@ i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
case DATA_TYPE_NVLIST_ARRAY:
nelem = va_arg(ap, int);
ret = nvlist_add_nvlist_array(payload, name,
- va_arg(ap, nvlist_t **), nelem);
+ va_arg(ap, const nvlist_t **), nelem);
break;
default:
ret = EINVAL;
@@ -867,8 +865,10 @@ fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
}
va_end(ap);
- if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST,
+ (const nvlist_t **)pairs, npairs) != 0) {
atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
for (i = 0; i < npairs; i++)
fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
@@ -891,7 +891,7 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
uint_t n;
int i, j;
va_list ap;
- char *hcname, *hcid;
+ const char *hcname, *hcid;
if (!fm_fmri_hc_set_common(fmri, version, auth))
return;
@@ -953,6 +953,7 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
}
atomic_inc_64(
&erpt_kstat_data.fmri_set_failed.value.ui64);
+ va_end(ap);
return;
}
}
@@ -961,8 +962,8 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
/*
* Create the fmri hc list
*/
- if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
- npairs + n) != 0) {
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST,
+ (const nvlist_t **)pairs, npairs + n) != 0) {
atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
@@ -1128,7 +1129,7 @@ fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
if (serial != NULL) {
if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
- (char **)&serial, 1) != 0) {
+ (const char **)&serial, 1) != 0) {
atomic_inc_64(
&erpt_kstat_data.fmri_set_failed.value.ui64);
}
@@ -1340,7 +1341,7 @@ fm_init(void)
void
fm_fini(void)
{
- int count;
+ uint_t count;
zfs_ereport_fini();
@@ -1352,7 +1353,7 @@ fm_fini(void)
zevent_flags |= ZEVENT_SHUTDOWN;
while (zevent_waiters > 0) {
mutex_exit(&zevent_lock);
- schedule();
+ kpreempt(KPREEMPT_SYNC);
mutex_enter(&zevent_lock);
}
mutex_exit(&zevent_lock);
@@ -1368,5 +1369,5 @@ fm_fini(void)
}
#endif /* _KERNEL */
-ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, UINT, ZMOD_RW,
"Max event queue length");
diff --git a/sys/contrib/openzfs/module/zfs/gzip.c b/sys/contrib/openzfs/module/zfs/gzip.c
index e2c6e59969d6..f3b19446352a 100644
--- a/sys/contrib/openzfs/module/zfs/gzip.c
+++ b/sys/contrib/openzfs/module/zfs/gzip.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -28,7 +28,6 @@
#include <sys/debug.h>
#include <sys/types.h>
-#include <sys/strings.h>
#include <sys/qat.h>
#include <sys/zio_compress.h>
@@ -66,7 +65,7 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
if (d_len != s_len)
return (s_len);
- bcopy(s_start, d_start, s_len);
+ memcpy(d_start, s_start, s_len);
return (s_len);
}
/* if hardware compression fails, do it again with software */
@@ -76,17 +75,17 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
if (d_len != s_len)
return (s_len);
- bcopy(s_start, d_start, s_len);
+ memcpy(d_start, s_start, s_len);
return (s_len);
}
return ((size_t)dstlen);
}
-/*ARGSUSED*/
int
gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
{
+ (void) n;
zlen_t dstlen = d_len;
ASSERT(d_len >= s_len);
diff --git a/sys/contrib/openzfs/module/zfs/hkdf.c b/sys/contrib/openzfs/module/zfs/hkdf.c
index 14265472df7d..580544c8ac1a 100644
--- a/sys/contrib/openzfs/module/zfs/hkdf.c
+++ b/sys/contrib/openzfs/module/zfs/hkdf.c
@@ -36,7 +36,6 @@ hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
mech.cm_param_len = 0;
/* initialize the salt as a crypto key */
- key.ck_format = CRYPTO_KEY_RAW;
key.ck_length = CRYPTO_BYTES2BITS(salt_len);
key.ck_data = salt;
@@ -53,7 +52,7 @@ hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
output_cd.cd_raw.iov_base = (char *)out_buf;
output_cd.cd_raw.iov_len = output_cd.cd_length;
- ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL);
+ ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd);
if (ret != CRYPTO_SUCCESS)
return (SET_ERROR(EIO));
@@ -83,7 +82,6 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
mech.cm_param_len = 0;
/* initialize the salt as a crypto key */
- key.ck_format = CRYPTO_KEY_RAW;
key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
key.ck_data = extract_key;
@@ -110,19 +108,19 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
T_cd.cd_length = T_len;
T_cd.cd_raw.iov_len = T_cd.cd_length;
- ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL);
+ ret = crypto_mac_init(&mech, &key, NULL, &ctx);
if (ret != CRYPTO_SUCCESS)
return (SET_ERROR(EIO));
- ret = crypto_mac_update(ctx, &T_cd, NULL);
+ ret = crypto_mac_update(ctx, &T_cd);
if (ret != CRYPTO_SUCCESS)
return (SET_ERROR(EIO));
- ret = crypto_mac_update(ctx, &info_cd, NULL);
+ ret = crypto_mac_update(ctx, &info_cd);
if (ret != CRYPTO_SUCCESS)
return (SET_ERROR(EIO));
- ret = crypto_mac_update(ctx, &c_cd, NULL);
+ ret = crypto_mac_update(ctx, &c_cd);
if (ret != CRYPTO_SUCCESS)
return (SET_ERROR(EIO));
@@ -130,11 +128,11 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
T_cd.cd_length = T_len;
T_cd.cd_raw.iov_len = T_cd.cd_length;
- ret = crypto_mac_final(ctx, &T_cd, NULL);
+ ret = crypto_mac_final(ctx, &T_cd);
if (ret != CRYPTO_SUCCESS)
return (SET_ERROR(EIO));
- bcopy(T, out_buf + pos,
+ memcpy(out_buf + pos, T,
(i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
pos += SHA512_DIGEST_LENGTH;
}
diff --git a/sys/contrib/openzfs/module/zfs/lz4.c b/sys/contrib/openzfs/module/zfs/lz4.c
index 9da9d9e00635..75a31bf17ea4 100644
--- a/sys/contrib/openzfs/module/zfs/lz4.c
+++ b/sys/contrib/openzfs/module/zfs/lz4.c
@@ -1,165 +1,50 @@
/*
- * LZ4 - Fast LZ compression algorithm
- * Header File
- * Copyright (C) 2011-2013, Yann Collet.
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
+ LZ4 - Fast LZ compression algorithm
+ Copyright (C) 2011-present, Yann Collet.
+
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 homepage : http://www.lz4.org
+ - LZ4 source repository : https://github.com/lz4/lz4
+*/
+
+/*
+ * This file contains unmodified code from lz4 1.9.3's decompressor, plus
+ * associated macros and constants.
*
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * It also contains a couple of defines from the old lz4.c to make things
+ * fit together smoothly.
*
- * You can contact the author at :
- * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
- * - LZ4 source repository : http://code.google.com/p/lz4/
*/
#include <sys/zfs_context.h>
-#include <sys/zio_compress.h>
-static int real_LZ4_compress(const char *source, char *dest, int isize,
- int osize);
-static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
+int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
int isize, int maxOutputSize);
-static int LZ4_compressCtx(void *ctx, const char *source, char *dest,
- int isize, int osize);
-static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
- int isize, int osize);
-
-static void *lz4_alloc(int flags);
-static void lz4_free(void *ctx);
-
-/*ARGSUSED*/
-size_t
-lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
- size_t d_len, int n)
-{
- uint32_t bufsiz;
- char *dest = d_start;
-
- ASSERT(d_len >= sizeof (bufsiz));
-
- bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len,
- d_len - sizeof (bufsiz));
-
- /* Signal an error if the compression routine returned zero. */
- if (bufsiz == 0)
- return (s_len);
-
- /*
- * The exact compressed size is needed by the decompression routine,
- * so it is stored at the start of the buffer. Note that this may be
- * less than the compressed block size, which is rounded up to a
- * multiple of 1<<ashift.
- */
- *(uint32_t *)dest = BE_32(bufsiz);
-
- return (bufsiz + sizeof (bufsiz));
-}
-
-/*ARGSUSED*/
-int
-lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
- size_t d_len, int n)
-{
- const char *src = s_start;
- uint32_t bufsiz = BE_IN32(src);
-
- /* invalid compressed buffer size encoded at start */
- if (bufsiz + sizeof (bufsiz) > s_len)
- return (1);
-
- /*
- * Returns 0 on success (decompression function returned non-negative)
- * and non-zero on failure (decompression function returned negative).
- */
- return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
- d_start, bufsiz, d_len) < 0);
-}
-
-/*
- * LZ4 API Description:
- *
- * Simple Functions:
- * real_LZ4_compress() :
- * isize : is the input size. Max supported value is ~1.9GB
- * return : the number of bytes written in buffer dest
- * or 0 if the compression fails (if LZ4_COMPRESSMIN is set).
- * note : destination buffer must be already allocated.
- * destination buffer must be sized to handle worst cases
- * situations (input data not compressible) worst case size
- * evaluation is provided by function LZ4_compressBound().
- *
- * real_LZ4_uncompress() :
- * osize : is the output size, therefore the original size
- * return : the number of bytes read in the source buffer.
- * If the source stream is malformed, the function will stop
- * decoding and return a negative result, indicating the byte
- * position of the faulty instruction. This function never
- * writes beyond dest + osize, and is therefore protected
- * against malicious data packets.
- * note : destination buffer must be already allocated
- * note : real_LZ4_uncompress() is not used in ZFS so its code
- * is not present here.
- *
- * Advanced Functions
- *
- * LZ4_compressBound() :
- * Provides the maximum size that LZ4 may output in a "worst case"
- * scenario (input data not compressible) primarily useful for memory
- * allocation of output buffer.
- *
- * isize : is the input size. Max supported value is ~1.9GB
- * return : maximum output size in a "worst case" scenario
- * note : this function is limited by "int" range (2^31-1)
- *
- * LZ4_uncompress_unknownOutputSize() :
- * isize : is the input size, therefore the compressed size
- * maxOutputSize : is the size of the destination buffer (which must be
- * already allocated)
- * return : the number of bytes decoded in the destination buffer
- * (necessarily <= maxOutputSize). If the source stream is
- * malformed, the function will stop decoding and return a
- * negative result, indicating the byte position of the faulty
- * instruction. This function never writes beyond dest +
- * maxOutputSize, and is therefore protected against malicious
- * data packets.
- * note : Destination buffer must be already allocated.
- * This version is slightly slower than real_LZ4_uncompress()
- *
- * LZ4_compressCtx() :
- * This function explicitly handles the CTX memory structure.
- *
- * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
- * by the caller (either on the stack or using kmem_cache_alloc). Passing
- * NULL isn't valid.
- *
- * LZ4_compress64kCtx() :
- * Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
- * isize *Must* be <64KB, otherwise the output will be corrupted.
- *
- * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
- * by the caller (either on the stack or using kmem_cache_alloc). Passing
- * NULL isn't valid.
- */
/*
* Tuning parameters
@@ -186,26 +71,6 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
#define NOTCOMPRESSIBLE_CONFIRMATION 6
/*
- * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to
- * performance for big endian cpu, but the resulting compressed stream
- * will be incompatible with little-endian CPU. You can set this option
- * to 1 in situations where data will stay within closed environment.
- * This option is useless on Little_Endian CPU (such as x86).
- */
-/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */
-
-/*
- * CPU Feature Detection
- */
-
-/* 32 or 64 bits ? */
-#if defined(_LP64)
-#define LZ4_ARCH64 1
-#else
-#define LZ4_ARCH64 0
-#endif
-
-/*
* Little Endian or Big Endian?
* Note: overwrite the below #define if you know your architecture endianness.
*/
@@ -219,25 +84,44 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
#undef LZ4_BIG_ENDIAN
#endif
-/*
- * Unaligned memory access is automatically enabled for "common" CPU,
- * such as x86. For others CPU, the compiler will be more cautious, and
- * insert extra code to ensure aligned access is respected. If you know
- * your target CPU supports unaligned memory access, you may want to
- * force this option manually to improve performance
+/*-************************************
+* CPU Feature Detection
+**************************************/
+/* LZ4_FORCE_MEMORY_ACCESS
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The below switch allow to select different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violate C standard.
+ * It can generate buggy code on targets which assembly generation depends on alignment.
+ * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
*/
-#if defined(__ARM_FEATURE_UNALIGNED)
-#define LZ4_FORCE_UNALIGNED_ACCESS 1
+#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */
+# if defined(__GNUC__) && \
+ ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \
+ || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+# define LZ4_FORCE_MEMORY_ACCESS 2
+# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__)
+# define LZ4_FORCE_MEMORY_ACCESS 1
+# endif
#endif
/*
+ * LZ4_FORCE_SW_BITCOUNT
+ * Define this parameter if your target system or compiler does not support hardware bit count
+ */
+/*
* Illumos : we can't use GCC's __builtin_ctz family of builtins in the
* kernel
* Linux : we can use GCC's __builtin_ctz family of builtins in the
* kernel
*/
#undef LZ4_FORCE_SW_BITCOUNT
-#if defined(__sparc)
+#if defined(__sunos__)
#define LZ4_FORCE_SW_BITCOUNT
#endif
@@ -257,10 +141,50 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
-#define expect(expr, value) (__builtin_expect((expr), (value)))
+#ifndef LZ4_FORCE_INLINE
+# ifdef _MSC_VER /* Visual Studio */
+# define LZ4_FORCE_INLINE static __forceinline
+# else
+# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# ifdef __GNUC__
+# define LZ4_FORCE_INLINE static inline __attribute__((always_inline))
+# else
+# define LZ4_FORCE_INLINE static inline
+# endif
+# else
+# define LZ4_FORCE_INLINE static
+# endif /* __STDC_VERSION__ */
+# endif /* _MSC_VER */
+#endif /* LZ4_FORCE_INLINE */
+
+/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE
+ * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
+ * together with a simple 8-byte copy loop as a fall-back path.
+ * However, this optimization hurts the decompression speed by >30%,
+ * because the execution does not go to the optimized loop
+ * for typical compressible data, and all of the preamble checks
+ * before going to the fall-back path become useless overhead.
+ * This optimization happens only with the -O3 flag, and -O2 generates
+ * a simple 8-byte copy loop.
+ * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
+ * functions are annotated with __attribute__((optimize("O2"))),
+ * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
+ * of LZ4_wildCopy8 does not affect the compression speed.
+ */
+#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
+# define LZ4_FORCE_O2 __attribute__((optimize("O2")))
+# undef LZ4_FORCE_INLINE
+# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline))
#else
-#define expect(expr, value) (expr)
+# define LZ4_FORCE_O2
+#endif
+
+#ifndef expect
+#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)
+# define expect(expr,value) (__builtin_expect ((expr),(value)) )
+#else
+# define expect(expr,value) (expr)
+#endif
#endif
#ifndef likely
@@ -271,814 +195,793 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
#define unlikely(expr) expect((expr) != 0, 0)
#endif
-#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \
- (((x) & 0xffu) << 8)))
+#ifndef _KERNEL
+#include <stdlib.h> /* malloc, calloc, free */
+#include <string.h> /* memset, memcpy */
+#endif
+#define ALLOC(s) malloc(s)
+#define ALLOC_AND_ZERO(s) calloc(1,s)
+#define FREEMEM(p) free(p)
+
+#define MEM_INIT(p,v,s) memset((p),(v),(s))
+
+
+/*-************************************
+* Common Constants
+**************************************/
+#define MINMATCH 4
-/* Basic types */
-#define BYTE uint8_t
-#define U16 uint16_t
-#define U32 uint32_t
-#define S32 int32_t
-#define U64 uint64_t
+#define WILDCOPYLENGTH 8
+#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */
+#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */
+#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */
+#define FASTLOOP_SAFE_DISTANCE 64
-#ifndef LZ4_FORCE_UNALIGNED_ACCESS
-#pragma pack(1)
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */
+# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
#endif
-typedef struct _U16_S {
- U16 v;
-} U16_S;
-typedef struct _U32_S {
- U32 v;
-} U32_S;
-typedef struct _U64_S {
- U64 v;
-} U64_S;
-
-#ifndef LZ4_FORCE_UNALIGNED_ACCESS
-#pragma pack()
+#define LZ4_DISTANCE_ABSOLUTE_MAX 65535
+#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */
+# error "LZ4_DISTANCE_MAX is too big : must be <= 65535"
#endif
-#define A64(x) (((U64_S *)(x))->v)
-#define A32(x) (((U32_S *)(x))->v)
-#define A16(x) (((U16_S *)(x))->v)
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
-/*
- * Constants
- */
-#define MINMATCH 4
+#define DEBUGLOG(l, ...) {} /* disabled */
-#define HASH_LOG COMPRESSIONLEVEL
-#define HASHTABLESIZE (1 << HASH_LOG)
-#define HASH_MASK (HASHTABLESIZE - 1)
+#ifndef assert
+#define assert ASSERT
+#endif
-#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \
- NOTCOMPRESSIBLE_CONFIRMATION : 2)
+/*-************************************
+* Types
+**************************************/
+#ifndef _KERNEL
+#include <limits.h>
+#endif
+#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#ifndef _KERNEL
+#include <stdint.h>
+#endif
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+ typedef uintptr_t uptrval;
+#else
+# if UINT_MAX != 4294967295UL
+# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4"
+# endif
+ typedef unsigned char BYTE;
+ typedef unsigned short U16;
+ typedef unsigned int U32;
+ typedef signed int S32;
+ typedef unsigned long long U64;
+ typedef size_t uptrval; /* generally true, except OpenVMS-64 */
+#endif
-#define COPYLENGTH 8
-#define LASTLITERALS 5
-#define MFLIMIT (COPYLENGTH + MINMATCH)
-#define MINLENGTH (MFLIMIT + 1)
+#if defined(__x86_64__)
+ typedef U64 reg_t; /* 64-bits in x32 mode */
+#else
+ typedef size_t reg_t; /* 32-bits in x32 mode */
+#endif
-#define MAXD_LOG 16
-#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+typedef enum {
+ notLimited = 0,
+ limitedOutput = 1,
+ fillOutput = 2
+} limitedOutput_directive;
-#define ML_BITS 4
-#define ML_MASK ((1U<<ML_BITS)-1)
-#define RUN_BITS (8-ML_BITS)
-#define RUN_MASK ((1U<<RUN_BITS)-1)
+/*-************************************
+* Reading and writing into memory
+**************************************/
-/*
- * Architecture-specific macros
+/**
+ * LZ4 relies on memcpy with a constant size being inlined. In freestanding
+ * environments, the compiler can't assume the implementation of memcpy() is
+ * standard compliant, so it can't apply its specialized memcpy() inlining
+ * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze
+ * memcpy() as if it were standard compliant, so it can inline it in freestanding
+ * environments. This is needed when decompressing the Linux Kernel, for example.
*/
-#if LZ4_ARCH64
-#define STEPSIZE 8
-#define UARCH U64
-#define AARCH A64
-#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8;
-#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d)
-#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e)
-#define HTYPE U32
-#define INITBASE(base) const BYTE* const base = ip
-#else /* !LZ4_ARCH64 */
-#define STEPSIZE 4
-#define UARCH U32
-#define AARCH A32
-#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4;
-#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d);
-#define LZ4_SECURECOPY LZ4_WILDCOPY
-#define HTYPE const BYTE *
-#define INITBASE(base) const int base = 0
-#endif /* !LZ4_ARCH64 */
-
-#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
-#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
- { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
-#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \
- { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; }
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
#else
-#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); }
-#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; }
+#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size)
#endif
+static unsigned LZ4_isLittleEndian(void)
+{
+ const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */
+ return one.c[0];
+}
-/* Local structures */
-struct refTables {
- HTYPE hashTable[HASHTABLESIZE];
-};
+#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2)
+/* lie to the compiler about data alignment; use with caution */
-/* Macros */
-#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \
- HASH_LOG))
-#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
-#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e);
-#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \
- d = e; }
+static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; }
+static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
-/* Private functions */
-#if LZ4_ARCH64
+#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1)
-static inline int
-LZ4_NbCommonBytes(register U64 val)
-{
-#if defined(LZ4_BIG_ENDIAN)
-#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
- !defined(LZ4_FORCE_SW_BITCOUNT)
- return (__builtin_clzll(val) >> 3);
-#else
- int r;
- if (!(val >> 32)) {
- r = 4;
- } else {
- r = 0;
- val >>= 32;
- }
- if (!(val >> 16)) {
- r += 2;
- val >>= 8;
- } else {
- val >>= 24;
- }
- r += (!val);
- return (r);
-#endif
-#else
-#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
- !defined(LZ4_FORCE_SW_BITCOUNT)
- return (__builtin_ctzll(val) >> 3);
-#else
- static const int DeBruijnBytePos[64] =
- { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5,
- 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5,
- 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4,
- 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
- };
- return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >>
- 58];
-#endif
-#endif
-}
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign;
-#else
+static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
+
+static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
+
+#else /* safe and portable access using memcpy() */
-static inline int
-LZ4_NbCommonBytes(register U32 val)
+static U16 LZ4_read16(const void* memPtr)
{
-#if defined(LZ4_BIG_ENDIAN)
-#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
- !defined(LZ4_FORCE_SW_BITCOUNT)
- return (__builtin_clz(val) >> 3);
-#else
- int r;
- if (!(val >> 16)) {
- r = 2;
- val >>= 8;
- } else {
- r = 0;
- val >>= 24;
- }
- r += (!val);
- return (r);
-#endif
-#else
-#if defined(__GNUC__) && (GCC_VERSION >= 304) && \
- !defined(LZ4_FORCE_SW_BITCOUNT)
- return (__builtin_ctz(val) >> 3);
-#else
- static const int DeBruijnBytePos[32] = {
- 0, 0, 3, 0, 3, 1, 3, 0,
- 3, 2, 2, 1, 3, 2, 0, 1,
- 3, 3, 1, 2, 2, 2, 2, 0,
- 3, 1, 2, 0, 1, 0, 1, 1
- };
- return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >>
- 27];
-#endif
-#endif
+ U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
}
-#endif
+static void LZ4_write32(void* memPtr, U32 value)
+{
+ LZ4_memcpy(memPtr, &value, sizeof(value));
+}
-/* Compression functions */
+#endif /* LZ4_FORCE_MEMORY_ACCESS */
-/*ARGSUSED*/
-static int
-LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize,
- int osize)
+static U16 LZ4_readLE16(const void* memPtr)
{
- struct refTables *srt = (struct refTables *)ctx;
- HTYPE *HashTable = (HTYPE *) (srt->hashTable);
-
- const BYTE *ip = (BYTE *) source;
- INITBASE(base);
- const BYTE *anchor = ip;
- const BYTE *const iend = ip + isize;
- const BYTE *const oend = (BYTE *) dest + osize;
- const BYTE *const mflimit = iend - MFLIMIT;
-#define matchlimit (iend - LASTLITERALS)
-
- BYTE *op = (BYTE *) dest;
-
- int len, length;
- const int skipStrength = SKIPSTRENGTH;
- U32 forwardH;
-
-
- /* Init */
- if (isize < MINLENGTH)
- goto _last_literals;
-
- /* First Byte */
- HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
- ip++;
- forwardH = LZ4_HASH_VALUE(ip);
-
- /* Main Loop */
- for (;;) {
- int findMatchAttempts = (1U << skipStrength) + 3;
- const BYTE *forwardIp = ip;
- const BYTE *ref;
- BYTE *token;
-
- /* Find a match */
- do {
- U32 h = forwardH;
- int step = findMatchAttempts++ >> skipStrength;
- ip = forwardIp;
- forwardIp = ip + step;
-
- if (unlikely(forwardIp > mflimit)) {
- goto _last_literals;
- }
-
- forwardH = LZ4_HASH_VALUE(forwardIp);
- ref = base + HashTable[h];
- HashTable[h] = ip - base;
-
- } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
-
- /* Catch up */
- while ((ip > anchor) && (ref > (BYTE *) source) &&
- unlikely(ip[-1] == ref[-1])) {
- ip--;
- ref--;
- }
-
- /* Encode Literal length */
- length = ip - anchor;
- token = op++;
-
- /* Check output limit */
- if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
- (length >> 8) > oend))
- return (0);
-
- if (length >= (int)RUN_MASK) {
- *token = (RUN_MASK << ML_BITS);
- len = length - RUN_MASK;
- for (; len > 254; len -= 255)
- *op++ = 255;
- *op++ = (BYTE)len;
- } else
- *token = (length << ML_BITS);
-
- /* Copy Literals */
- LZ4_BLINDCOPY(anchor, op, length);
-
- _next_match:
- /* Encode Offset */
- LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
-
- /* Start Counting */
- ip += MINMATCH;
- ref += MINMATCH; /* MinMatch verified */
- anchor = ip;
- while (likely(ip < matchlimit - (STEPSIZE - 1))) {
- UARCH diff = AARCH(ref) ^ AARCH(ip);
- if (!diff) {
- ip += STEPSIZE;
- ref += STEPSIZE;
- continue;
- }
- ip += LZ4_NbCommonBytes(diff);
- goto _endCount;
- }
-#if LZ4_ARCH64
- if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
- ip += 4;
- ref += 4;
- }
-#endif
- if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
- ip += 2;
- ref += 2;
- }
- if ((ip < matchlimit) && (*ref == *ip))
- ip++;
- _endCount:
-
- /* Encode MatchLength */
- len = (ip - anchor);
- /* Check output limit */
- if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
- return (0);
- if (len >= (int)ML_MASK) {
- *token += ML_MASK;
- len -= ML_MASK;
- for (; len > 509; len -= 510) {
- *op++ = 255;
- *op++ = 255;
- }
- if (len > 254) {
- len -= 255;
- *op++ = 255;
- }
- *op++ = (BYTE)len;
- } else
- *token += len;
-
- /* Test end of chunk */
- if (ip > mflimit) {
- anchor = ip;
- break;
- }
- /* Fill table */
- HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base;
-
- /* Test next position */
- ref = base + HashTable[LZ4_HASH_VALUE(ip)];
- HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
- if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
- token = op++;
- *token = 0;
- goto _next_match;
- }
- /* Prepare next loop */
- anchor = ip++;
- forwardH = LZ4_HASH_VALUE(ip);
- }
-
- _last_literals:
- /* Encode Last Literals */
- {
- int lastRun = iend - anchor;
- if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
- oend)
- return (0);
- if (lastRun >= (int)RUN_MASK) {
- *op++ = (RUN_MASK << ML_BITS);
- lastRun -= RUN_MASK;
- for (; lastRun > 254; lastRun -= 255) {
- *op++ = 255;
- }
- *op++ = (BYTE)lastRun;
- } else
- *op++ = (lastRun << ML_BITS);
- (void) memcpy(op, anchor, iend - anchor);
- op += iend - anchor;
- }
-
- /* End */
- return (int)(((char *)op) - dest);
+ if (LZ4_isLittleEndian()) {
+ return LZ4_read16(memPtr);
+ } else {
+ const BYTE* p = (const BYTE*)memPtr;
+ return (U16)((U16)p[0] + (p[1]<<8));
+ }
}
+/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */
+LZ4_FORCE_INLINE
+void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd)
+{
+ BYTE* d = (BYTE*)dstPtr;
+ const BYTE* s = (const BYTE*)srcPtr;
+ BYTE* const e = (BYTE*)dstEnd;
+ do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e);
+}
-/* Note : this function is valid only if isize < LZ4_64KLIMIT */
-#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1))
-#define HASHLOG64K (HASH_LOG + 1)
-#define HASH64KTABLESIZE (1U << HASHLOG64K)
-#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \
- HASHLOG64K))
-#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p))
+static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
+static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
+
+
+#ifndef LZ4_FAST_DEC_LOOP
+# if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64
+# define LZ4_FAST_DEC_LOOP 1
+# elif defined(__aarch64__) && !defined(__clang__)
+ /* On aarch64, we disable this optimization for clang because on certain
+ * mobile chipsets, performance is reduced with clang. For information
+ * refer to https://github.com/lz4/lz4/pull/707 */
+# define LZ4_FAST_DEC_LOOP 1
+# else
+# define LZ4_FAST_DEC_LOOP 0
+# endif
+#endif
+
+#if LZ4_FAST_DEC_LOOP
-/*ARGSUSED*/
-static int
-LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize,
- int osize)
+LZ4_FORCE_INLINE void
+LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset)
{
- struct refTables *srt = (struct refTables *)ctx;
- U16 *HashTable = (U16 *) (srt->hashTable);
-
- const BYTE *ip = (BYTE *) source;
- const BYTE *anchor = ip;
- const BYTE *const base = ip;
- const BYTE *const iend = ip + isize;
- const BYTE *const oend = (BYTE *) dest + osize;
- const BYTE *const mflimit = iend - MFLIMIT;
-#define matchlimit (iend - LASTLITERALS)
-
- BYTE *op = (BYTE *) dest;
-
- int len, length;
- const int skipStrength = SKIPSTRENGTH;
- U32 forwardH;
-
- /* Init */
- if (isize < MINLENGTH)
- goto _last_literals;
-
- /* First Byte */
- ip++;
- forwardH = LZ4_HASH64K_VALUE(ip);
-
- /* Main Loop */
- for (;;) {
- int findMatchAttempts = (1U << skipStrength) + 3;
- const BYTE *forwardIp = ip;
- const BYTE *ref;
- BYTE *token;
-
- /* Find a match */
- do {
- U32 h = forwardH;
- int step = findMatchAttempts++ >> skipStrength;
- ip = forwardIp;
- forwardIp = ip + step;
-
- if (forwardIp > mflimit) {
- goto _last_literals;
- }
-
- forwardH = LZ4_HASH64K_VALUE(forwardIp);
- ref = base + HashTable[h];
- HashTable[h] = ip - base;
-
- } while (A32(ref) != A32(ip));
-
- /* Catch up */
- while ((ip > anchor) && (ref > (BYTE *) source) &&
- (ip[-1] == ref[-1])) {
- ip--;
- ref--;
- }
-
- /* Encode Literal length */
- length = ip - anchor;
- token = op++;
-
- /* Check output limit */
- if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
- (length >> 8) > oend))
- return (0);
-
- if (length >= (int)RUN_MASK) {
- *token = (RUN_MASK << ML_BITS);
- len = length - RUN_MASK;
- for (; len > 254; len -= 255)
- *op++ = 255;
- *op++ = (BYTE)len;
- } else
- *token = (length << ML_BITS);
-
- /* Copy Literals */
- LZ4_BLINDCOPY(anchor, op, length);
-
- _next_match:
- /* Encode Offset */
- LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
-
- /* Start Counting */
- ip += MINMATCH;
- ref += MINMATCH; /* MinMatch verified */
- anchor = ip;
- while (ip < matchlimit - (STEPSIZE - 1)) {
- UARCH diff = AARCH(ref) ^ AARCH(ip);
- if (!diff) {
- ip += STEPSIZE;
- ref += STEPSIZE;
- continue;
- }
- ip += LZ4_NbCommonBytes(diff);
- goto _endCount;
- }
-#if LZ4_ARCH64
- if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
- ip += 4;
- ref += 4;
- }
-#endif
- if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
- ip += 2;
- ref += 2;
- }
- if ((ip < matchlimit) && (*ref == *ip))
- ip++;
- _endCount:
-
- /* Encode MatchLength */
- len = (ip - anchor);
- /* Check output limit */
- if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
- return (0);
- if (len >= (int)ML_MASK) {
- *token += ML_MASK;
- len -= ML_MASK;
- for (; len > 509; len -= 510) {
- *op++ = 255;
- *op++ = 255;
- }
- if (len > 254) {
- len -= 255;
- *op++ = 255;
- }
- *op++ = (BYTE)len;
- } else
- *token += len;
-
- /* Test end of chunk */
- if (ip > mflimit) {
- anchor = ip;
- break;
- }
- /* Fill table */
- HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base;
-
- /* Test next position */
- ref = base + HashTable[LZ4_HASH64K_VALUE(ip)];
- HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base;
- if (A32(ref) == A32(ip)) {
- token = op++;
- *token = 0;
- goto _next_match;
- }
- /* Prepare next loop */
- anchor = ip++;
- forwardH = LZ4_HASH64K_VALUE(ip);
- }
-
- _last_literals:
- /* Encode Last Literals */
- {
- int lastRun = iend - anchor;
- if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
- oend)
- return (0);
- if (lastRun >= (int)RUN_MASK) {
- *op++ = (RUN_MASK << ML_BITS);
- lastRun -= RUN_MASK;
- for (; lastRun > 254; lastRun -= 255)
- *op++ = 255;
- *op++ = (BYTE)lastRun;
- } else
- *op++ = (lastRun << ML_BITS);
- (void) memcpy(op, anchor, iend - anchor);
- op += iend - anchor;
- }
-
- /* End */
- return (int)(((char *)op) - dest);
+ assert(srcPtr + offset == dstPtr);
+ if (offset < 8) {
+ LZ4_write32(dstPtr, 0); /* silence an msan warning when offset==0 */
+ dstPtr[0] = srcPtr[0];
+ dstPtr[1] = srcPtr[1];
+ dstPtr[2] = srcPtr[2];
+ dstPtr[3] = srcPtr[3];
+ srcPtr += inc32table[offset];
+ LZ4_memcpy(dstPtr+4, srcPtr, 4);
+ srcPtr -= dec64table[offset];
+ dstPtr += 8;
+ } else {
+ LZ4_memcpy(dstPtr, srcPtr, 8);
+ dstPtr += 8;
+ srcPtr += 8;
+ }
+
+ LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
}
-static int
-real_LZ4_compress(const char *source, char *dest, int isize, int osize)
+/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd;
+ * this version copies two times 16 bytes (instead of one time 32 bytes)
+ * because it must be compatible with offsets >= 16. */
+LZ4_FORCE_INLINE void
+LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd)
{
- void *ctx;
- int result;
-
- ctx = lz4_alloc(KM_SLEEP);
-
- /*
- * out of kernel memory, gently fall through - this will disable
- * compression in zio_compress_data
- */
- if (ctx == NULL)
- return (0);
+ BYTE* d = (BYTE*)dstPtr;
+ const BYTE* s = (const BYTE*)srcPtr;
+ BYTE* const e = (BYTE*)dstEnd;
- memset(ctx, 0, sizeof (struct refTables));
-
- if (isize < LZ4_64KLIMIT)
- result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
- else
- result = LZ4_compressCtx(ctx, source, dest, isize, osize);
+ do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
+}
- lz4_free(ctx);
- return (result);
+/* LZ4_memcpy_using_offset() presumes :
+ * - dstEnd >= dstPtr + MINMATCH
+ * - there are at least 8 bytes available to write after dstEnd */
+LZ4_FORCE_INLINE void
+LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset)
+{
+ BYTE v[8];
+
+ assert(dstEnd >= dstPtr + MINMATCH);
+
+ switch(offset) {
+ case 1:
+ MEM_INIT(v, *srcPtr, 8);
+ break;
+ case 2:
+ LZ4_memcpy(v, srcPtr, 2);
+ LZ4_memcpy(&v[2], srcPtr, 2);
+ LZ4_memcpy(&v[4], v, 4);
+ break;
+ case 4:
+ LZ4_memcpy(v, srcPtr, 4);
+ LZ4_memcpy(&v[4], srcPtr, 4);
+ break;
+ default:
+ LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
+ return;
+ }
+
+ LZ4_memcpy(dstPtr, v, 8);
+ dstPtr += 8;
+ while (dstPtr < dstEnd) {
+ LZ4_memcpy(dstPtr, v, 8);
+ dstPtr += 8;
+ }
}
+#endif
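
LZ4_memcpy_using_offset() turns an overlapping match with a small offset into whole-register stores: offsets 1, 2 and 4 are expanded into an 8-byte repeating pattern that is then stamped across the output, and every other offset below 16 falls back to LZ4_memcpy_using_offset_base(). A minimal userland sketch of the offset == 2 case follows; demo_offset2_copy() is hypothetical and not part of the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Toy model of the offset == 2 path: build the 8-byte pattern once
 * ("abababab"), then stamp it across the destination in 8-byte steps.
 */
static void
demo_offset2_copy(uint8_t *dst, const uint8_t *src, size_t len)
{
        uint8_t v[8];

        memcpy(v, src, 2);
        memcpy(&v[2], src, 2);
        memcpy(&v[4], v, 4);

        for (size_t i = 0; i < len; i += 8)
                memcpy(dst + i, v, 8);  /* may overshoot len; caller leaves slack */
}

int
main(void)
{
        uint8_t buf[32] = "ab";

        demo_offset2_copy(buf + 2, buf, 16);
        printf("%s\n", (char *)buf);    /* "ab" repeated nine times */
        return (0);
}
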
-/* Decompression functions */
-/*
- * Note: The decoding functions real_LZ4_uncompress() and
- * LZ4_uncompress_unknownOutputSize() are safe against "buffer overflow"
- * attack type. They will never write nor read outside of the provided
- * output buffers. LZ4_uncompress_unknownOutputSize() also insures that
- * it will never read outside of the input buffer. A corrupted input
- * will produce an error result, a negative int, indicating the position
- * of the error within input stream.
+/*-************************************
+* Local Structures and types
+**************************************/
+typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t;
+
+/**
+ * This enum distinguishes several different modes of accessing previous
+ * content in the stream.
*
- * Note[2]: real_LZ4_uncompress(), referred to above, is not used in ZFS so
- * its code is not present here.
+ * - noDict : There is no preceding content.
+ * - withPrefix64k : Table entries up to ctx->dictSize before the current blob
+ *                   being compressed are valid and refer to the preceding
+ * content (of length ctx->dictSize), which is available
+ * contiguously preceding in memory the content currently
+ * being compressed.
+ * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere
+ * else in memory, starting at ctx->dictionary with length
+ * ctx->dictSize.
+ * - usingDictCtx : Like usingExtDict, but everything concerning the preceding
+ * content is in a separate context, pointed to by
+ * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table
+ * entries in the current context that refer to positions
+ * preceding the beginning of the current compression are
+ * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx
+ * ->dictSize describe the location and size of the preceding
+ * content, and matches are found by looking in the ctx
+ * ->dictCtx->hashTable.
*/
+typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive;
+typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
-static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
-#if LZ4_ARCH64
-static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
-#endif
+/*-*******************************
+ * Decompression functions
+ ********************************/
-static int
-LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize,
- int maxOutputSize)
-{
- /* Local Variables */
- const BYTE *restrict ip = (const BYTE *) source;
- const BYTE *const iend = ip + isize;
- const BYTE *ref;
-
- BYTE *op = (BYTE *) dest;
- BYTE *const oend = op + maxOutputSize;
- BYTE *cpy;
-
- /* Main Loop */
- while (ip < iend) {
- unsigned token;
- size_t length;
-
- /* get runlength */
- token = *ip++;
- if ((length = (token >> ML_BITS)) == RUN_MASK) {
- int s = 255;
- while ((ip < iend) && (s == 255)) {
- s = *ip++;
- if (unlikely(length > (size_t)(length + s)))
- goto _output_error;
- length += s;
- }
- }
- /* copy literals */
- cpy = op + length;
- /* CORNER-CASE: cpy might overflow. */
- if (cpy < op)
- goto _output_error; /* cpy was overflowed, bail! */
- if ((cpy > oend - COPYLENGTH) ||
- (ip + length > iend - COPYLENGTH)) {
- if (cpy > oend)
- /* Error: writes beyond output buffer */
- goto _output_error;
- if (ip + length != iend)
- /*
- * Error: LZ4 format requires to consume all
- * input at this stage
- */
- goto _output_error;
- (void) memcpy(op, ip, length);
- op += length;
- /* Necessarily EOF, due to parsing restrictions */
- break;
- }
- LZ4_WILDCOPY(ip, op, cpy);
- ip -= (op - cpy);
- op = cpy;
-
- /* get offset */
- LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
- ip += 2;
- if (ref < (BYTE * const) dest)
- /*
- * Error: offset creates reference outside of
- * destination buffer
- */
- goto _output_error;
-
- /* get matchlength */
- if ((length = (token & ML_MASK)) == ML_MASK) {
- while (ip < iend) {
- int s = *ip++;
- if (unlikely(length > (size_t)(length + s)))
- goto _output_error;
- length += s;
- if (s == 255)
- continue;
- break;
- }
- }
- /* copy repeated sequence */
- if (unlikely(op - ref < STEPSIZE)) {
-#if LZ4_ARCH64
- int dec64 = dec64table[op - ref];
-#else
- const int dec64 = 0;
-#endif
- op[0] = ref[0];
- op[1] = ref[1];
- op[2] = ref[2];
- op[3] = ref[3];
- op += 4;
- ref += 4;
- ref -= dec32table[op - ref];
- A32(op) = A32(ref);
- op += STEPSIZE - 4;
- ref -= dec64;
- } else {
- LZ4_COPYSTEP(ref, op);
- }
- cpy = op + length - (STEPSIZE - 4);
- if (cpy > oend - COPYLENGTH) {
- if (cpy > oend)
- /*
- * Error: request to write outside of
- * destination buffer
- */
- goto _output_error;
-#if LZ4_ARCH64
- if ((ref + COPYLENGTH) > oend)
-#else
- if ((ref + COPYLENGTH) > oend ||
- (op + COPYLENGTH) > oend)
-#endif
- goto _output_error;
- LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
- while (op < cpy)
- *op++ = *ref++;
- op = cpy;
- if (op == oend)
- /*
- * Check EOF (should never happen, since
- * last 5 bytes are supposed to be literals)
- */
- goto _output_error;
- continue;
- }
- LZ4_SECURECOPY(ref, op, cpy);
- op = cpy; /* correction */
- }
-
- /* end of decoding */
- return (int)(((char *)op) - dest);
-
- /* write overflow error detected */
- _output_error:
- return (-1);
-}
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive;
-#ifdef __FreeBSD__
-/*
- * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here.
- * Should struct refTables get resized this may need to be revisited, hence
- * compiler-time asserts.
- */
-_Static_assert(sizeof(struct refTables) <= 16384,
- "refTables too big for malloc");
-_Static_assert((sizeof(struct refTables) % 4096) == 0,
- "refTables not a multiple of page size");
-#else
-#define ZFS_LZ4_USE_CACHE
-#endif
+typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error;
-#ifdef ZFS_LZ4_USE_CACHE
-static kmem_cache_t *lz4_cache;
-
-void
-lz4_init(void)
+LZ4_FORCE_INLINE unsigned
+read_variable_length(const BYTE**ip, const BYTE* lencheck,
+ int loop_check, int initial_check,
+ variable_length_error* error)
{
- lz4_cache = kmem_cache_create("lz4_cache",
- sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ U32 length = 0;
+ U32 s;
+ if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */
+ *error = initial_error;
+ return length;
+ }
+ do {
+ s = **ip;
+ (*ip)++;
+ length += s;
+ if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */
+ *error = loop_error;
+ return length;
+ }
+ } while (s==255);
+
+ return length;
}
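
read_variable_length() implements LZ4's length extension: when the 4-bit field in the token is saturated (15), each following byte is added to the length and the sequence stops at the first byte that is not 255, so a total length of 300 is encoded as 15 + 255 + 30. The toy below folds the token nibble and the extension sum together (the real function returns only the extension sum, which the caller adds); it is illustrative only.

#include <stdint.h>
#include <stdio.h>

static unsigned
toy_read_varlen(const uint8_t **ip)
{
        unsigned length = 15;   /* the saturated nibble from the token */
        uint8_t s;

        do {
                s = *(*ip)++;   /* each extension byte is added ... */
                length += s;
        } while (s == 255);     /* ... until one is not 255 */
        return (length);
}

int
main(void)
{
        const uint8_t ext[] = { 255, 30 };      /* 15 + 255 + 30 = 300 */
        const uint8_t *p = ext;

        printf("%u\n", toy_read_varlen(&p));    /* prints 300 */
        return (0);
}
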
-void
-lz4_fini(void)
-{
- if (lz4_cache) {
- kmem_cache_destroy(lz4_cache);
- lz4_cache = NULL;
- }
-}
+#define LZ4_STATIC_ASSERT(c) ASSERT(c)
-static void *
-lz4_alloc(int flags)
-{
- ASSERT(lz4_cache != NULL);
- return (kmem_cache_alloc(lz4_cache, flags));
-}
-static void
-lz4_free(void *ctx)
-{
- kmem_cache_free(lz4_cache, ctx);
-}
-#else
-void
-lz4_init(void)
+/*! LZ4_decompress_generic() :
+ * This generic decompression function covers all use cases.
+ * It shall be instantiated several times, using different sets of directives.
+ * Note that it is important for performance that this function really get inlined,
+ * in order to remove useless branches during compilation optimization.
+ */
+LZ4_FORCE_INLINE int
+LZ4_decompress_generic(
+ const char* const src,
+ char* const dst,
+ int srcSize,
+ int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */
+
+ endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */
+ earlyEnd_directive partialDecoding, /* full, partial */
+ dict_directive dict, /* noDict, withPrefix64k, usingExtDict */
+ const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */
+ const BYTE* const dictStart, /* only if dict==usingExtDict */
+ const size_t dictSize /* note : = 0 if noDict */
+ )
{
-}
+ if ((src == NULL) || (outputSize < 0)) { return -1; }
+
+ { const BYTE* ip = (const BYTE*) src;
+ const BYTE* const iend = ip + srcSize;
+
+ BYTE* op = (BYTE*) dst;
+ BYTE* const oend = op + outputSize;
+ BYTE* cpy;
+
+ const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize;
+
+ const int safeDecode = (endOnInput==endOnInputSize);
+ const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+
+
+ /* Set up the "end" pointers for the shortcut. */
+ const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
+ const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
+
+ const BYTE* match;
+ size_t offset;
+ unsigned token;
+ size_t length;
+
+
+ DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize);
+
+ /* Special cases */
+ assert(lowPrefix <= op);
+ if ((endOnInput) && (unlikely(outputSize==0))) {
+ /* Empty output buffer */
+ if (partialDecoding) return 0;
+ return ((srcSize==1) && (*ip==0)) ? 0 : -1;
+ }
+ if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 1 : -1); }
+ if ((endOnInput) && unlikely(srcSize==0)) { return -1; }
+
+ /* Currently the fast loop shows a regression on qualcomm arm chips. */
+#if LZ4_FAST_DEC_LOOP
+ if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
+ DEBUGLOG(6, "skip fast decode loop");
+ goto safe_decode;
+ }
+
+ /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
+ while (1) {
+ /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
+ assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
+ if (endOnInput) { assert(ip < iend); }
+ token = *ip++;
+ length = token >> ML_BITS; /* literal length */
+
+ assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
+
+ /* decode literal length */
+ if (length == RUN_MASK) {
+ variable_length_error error = ok;
+ length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error);
+ if (error == initial_error) { goto _output_error; }
+ if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
+ if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */
+
+ /* copy literals */
+ cpy = op+length;
+ LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+ if (endOnInput) { /* LZ4_decompress_safe() */
+ if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; }
+ LZ4_wildCopy32(op, ip, cpy);
+ } else { /* LZ4_decompress_fast() */
+ if (cpy>oend-8) { goto safe_literal_copy; }
+ LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
+ * it doesn't know input length, and only relies on end-of-block properties */
+ }
+ ip += length; op = cpy;
+ } else {
+ cpy = op+length;
+ if (endOnInput) { /* LZ4_decompress_safe() */
+ DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length);
+ /* We don't need to check oend, since we check it once for each loop below */
+ if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; }
+ /* Literals can only be <= 14, but hope compilers optimize if we copy by a register size */
+ LZ4_memcpy(op, ip, 16);
+ } else { /* LZ4_decompress_fast() */
+ /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time :
+ * it doesn't know input length, and relies on end-of-block properties */
+ LZ4_memcpy(op, ip, 8);
+ if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); }
+ }
+ ip += length; op = cpy;
+ }
+
+ /* get offset */
+ offset = LZ4_readLE16(ip); ip+=2;
+ match = op - offset;
+ assert(match <= op);
+
+ /* get matchlength */
+ length = token & ML_MASK;
+
+ if (length == ML_MASK) {
+ variable_length_error error = ok;
+ if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */
+ length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error);
+ if (error != ok) { goto _output_error; }
+ if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */
+ length += MINMATCH;
+ if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+ goto safe_match_copy;
+ }
+ } else {
+ length += MINMATCH;
+ if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
+ goto safe_match_copy;
+ }
+
+ /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
+ if ((dict == withPrefix64k) || (match >= lowPrefix)) {
+ if (offset >= 8) {
+ assert(match >= lowPrefix);
+ assert(match <= op);
+ assert(op + 18 <= oend);
+
+ LZ4_memcpy(op, match, 8);
+ LZ4_memcpy(op+8, match+8, 8);
+ LZ4_memcpy(op+16, match+16, 2);
+ op += length;
+ continue;
+ } } }
+
+ if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */
+ /* match starting within external dictionary */
+ if ((dict==usingExtDict) && (match < lowPrefix)) {
+ if (unlikely(op+length > oend-LASTLITERALS)) {
+ if (partialDecoding) {
+ DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd");
+ length = MIN(length, (size_t)(oend-op));
+ } else {
+ goto _output_error; /* end-of-block condition violated */
+ } }
+
+ if (length <= (size_t)(lowPrefix-match)) {
+ /* match fits entirely within external dictionary : just copy */
+ memmove(op, dictEnd - (lowPrefix-match), length);
+ op += length;
+ } else {
+ /* match stretches into both external dictionary and current block */
+ size_t const copySize = (size_t)(lowPrefix - match);
+ size_t const restSize = length - copySize;
+ LZ4_memcpy(op, dictEnd - copySize, copySize);
+ op += copySize;
+ if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */
+ BYTE* const endOfMatch = op + restSize;
+ const BYTE* copyFrom = lowPrefix;
+ while (op < endOfMatch) { *op++ = *copyFrom++; }
+ } else {
+ LZ4_memcpy(op, lowPrefix, restSize);
+ op += restSize;
+ } }
+ continue;
+ }
+
+ /* copy match within block */
+ cpy = op + length;
+
+ assert((op <= oend) && (oend-op >= 32));
+ if (unlikely(offset<16)) {
+ LZ4_memcpy_using_offset(op, match, cpy, offset);
+ } else {
+ LZ4_wildCopy32(op, match, cpy);
+ }
+
+ op = cpy; /* wildcopy correction */
+ }
+ safe_decode:
+#endif
-void
-lz4_fini(void)
-{
+ /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */
+ while (1) {
+ token = *ip++;
+ length = token >> ML_BITS; /* literal length */
+
+ assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
+
+ /* A two-stage shortcut for the most common case:
+ * 1) If the literal length is 0..14, and there is enough space,
+ * enter the shortcut and copy 16 bytes on behalf of the literals
+ * (in the fast mode, only 8 bytes can be safely copied this way).
+ * 2) Further if the match length is 4..18, copy 18 bytes in a similar
+ * manner; but we ensure that there's enough space in the output for
+ * those 18 bytes earlier, upon entering the shortcut (in other words,
+ * there is a combined check for both stages).
+ */
+ if ( (endOnInput ? length != RUN_MASK : length <= 8)
+ /* strictly "less than" on input, to re-enter the loop with at least one byte */
+ && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) {
+ /* Copy the literals */
+ LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
+ op += length; ip += length;
+
+ /* The second stage: prepare for match copying, decode full info.
+ * If it doesn't work out, the info won't be wasted. */
+ length = token & ML_MASK; /* match length */
+ offset = LZ4_readLE16(ip); ip += 2;
+ match = op - offset;
+ assert(match <= op); /* check overflow */
+
+ /* Do not deal with overlapping matches. */
+ if ( (length != ML_MASK)
+ && (offset >= 8)
+ && (dict==withPrefix64k || match >= lowPrefix) ) {
+ /* Copy the match. */
+ LZ4_memcpy(op + 0, match + 0, 8);
+ LZ4_memcpy(op + 8, match + 8, 8);
+ LZ4_memcpy(op +16, match +16, 2);
+ op += length + MINMATCH;
+ /* Both stages worked, load the next token. */
+ continue;
+ }
+
+ /* The second stage didn't work out, but the info is ready.
+ * Propel it right to the point of match copying. */
+ goto _copy_match;
+ }
+
+ /* decode literal length */
+ if (length == RUN_MASK) {
+ variable_length_error error = ok;
+ length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error);
+ if (error == initial_error) { goto _output_error; }
+ if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */
+ if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */
+ }
+
+ /* copy literals */
+ cpy = op+length;
+#if LZ4_FAST_DEC_LOOP
+ safe_literal_copy:
+#endif
+ LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
+ if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) )
+ || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) )
+ {
+ /* We've either hit the input parsing restriction or the output parsing restriction.
+ * In the normal scenario, decoding a full block, it must be the last sequence,
+ * otherwise it's an error (invalid input or dimensions).
+ * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow.
+ */
+ if (partialDecoding) {
+ /* Since we are partial decoding we may be in this block because of the output parsing
+ * restriction, which is not valid since the output buffer is allowed to be undersized.
+ */
+ assert(endOnInput);
+ DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end")
+ DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length);
+ DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op));
+ DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip));
+ /* Finishing in the middle of a literals segment,
+ * due to lack of input.
+ */
+ if (ip+length > iend) {
+ length = (size_t)(iend-ip);
+ cpy = op + length;
+ }
+ /* Finishing in the middle of a literals segment,
+ * due to lack of output space.
+ */
+ if (cpy > oend) {
+ cpy = oend;
+ assert(op<=oend);
+ length = (size_t)(oend-op);
+ }
+ } else {
+ /* We must be on the last sequence because of the parsing limitations so check
+ * that we exactly regenerate the original size (must be exact when !endOnInput).
+ */
+ if ((!endOnInput) && (cpy != oend)) { goto _output_error; }
+ /* We must be on the last sequence (or invalid) because of the parsing limitations
+ * so check that we exactly consume the input and don't overrun the output buffer.
+ */
+ if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) {
+ DEBUGLOG(6, "should have been last run of literals")
+ DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend);
+ DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend);
+ goto _output_error;
+ }
+ }
+ memmove(op, ip, length); /* supports overlapping memory regions; only matters for in-place decompression scenarios */
+ ip += length;
+ op += length;
+ /* Necessarily EOF when !partialDecoding.
+ * When partialDecoding, it is EOF if we've either
+ * filled the output buffer or
+ * can't proceed with reading an offset for following match.
+ */
+ if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) {
+ break;
+ }
+ } else {
+ LZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */
+ ip += length; op = cpy;
+ }
+
+ /* get offset */
+ offset = LZ4_readLE16(ip); ip+=2;
+ match = op - offset;
+
+ /* get matchlength */
+ length = token & ML_MASK;
+
+ _copy_match:
+ if (length == ML_MASK) {
+ variable_length_error error = ok;
+ length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error);
+ if (error != ok) goto _output_error;
+ if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */
+ }
+ length += MINMATCH;
+
+#if LZ4_FAST_DEC_LOOP
+ safe_match_copy:
+#endif
+ if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */
+ /* match starting within external dictionary */
+ if ((dict==usingExtDict) && (match < lowPrefix)) {
+ if (unlikely(op+length > oend-LASTLITERALS)) {
+ if (partialDecoding) length = MIN(length, (size_t)(oend-op));
+ else goto _output_error; /* doesn't respect parsing restriction */
+ }
+
+ if (length <= (size_t)(lowPrefix-match)) {
+ /* match fits entirely within external dictionary : just copy */
+ memmove(op, dictEnd - (lowPrefix-match), length);
+ op += length;
+ } else {
+ /* match stretches into both external dictionary and current block */
+ size_t const copySize = (size_t)(lowPrefix - match);
+ size_t const restSize = length - copySize;
+ LZ4_memcpy(op, dictEnd - copySize, copySize);
+ op += copySize;
+ if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */
+ BYTE* const endOfMatch = op + restSize;
+ const BYTE* copyFrom = lowPrefix;
+ while (op < endOfMatch) *op++ = *copyFrom++;
+ } else {
+ LZ4_memcpy(op, lowPrefix, restSize);
+ op += restSize;
+ } }
+ continue;
+ }
+ assert(match >= lowPrefix);
+
+ /* copy match within block */
+ cpy = op + length;
+
+ /* partialDecoding : may end anywhere within the block */
+ assert(op<=oend);
+ if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) {
+ size_t const mlen = MIN(length, (size_t)(oend-op));
+ const BYTE* const matchEnd = match + mlen;
+ BYTE* const copyEnd = op + mlen;
+ if (matchEnd > op) { /* overlap copy */
+ while (op < copyEnd) { *op++ = *match++; }
+ } else {
+ LZ4_memcpy(op, match, mlen);
+ }
+ op = copyEnd;
+ if (op == oend) { break; }
+ continue;
+ }
+
+ if (unlikely(offset<8)) {
+ LZ4_write32(op, 0); /* silence msan warning when offset==0 */
+ op[0] = match[0];
+ op[1] = match[1];
+ op[2] = match[2];
+ op[3] = match[3];
+ match += inc32table[offset];
+ LZ4_memcpy(op+4, match, 4);
+ match -= dec64table[offset];
+ } else {
+ LZ4_memcpy(op, match, 8);
+ match += 8;
+ }
+ op += 8;
+
+ if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) {
+ BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1);
+ if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */
+ if (op < oCopyLimit) {
+ LZ4_wildCopy8(op, match, oCopyLimit);
+ match += oCopyLimit - op;
+ op = oCopyLimit;
+ }
+ while (op < cpy) { *op++ = *match++; }
+ } else {
+ LZ4_memcpy(op, match, 8);
+ if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); }
+ }
+ op = cpy; /* wildcopy correction */
+ }
+
+ /* end of decoding */
+ if (endOnInput) {
+ DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst));
+ return (int) (((char*)op)-dst); /* Nb of output bytes decoded */
+ } else {
+ return (int) (((const char*)ip)-src); /* Nb of input bytes read */
+ }
+
+ /* Overflow error detected */
+ _output_error:
+ return (int) (-(((const char*)ip)-src))-1;
+ }
}
-static void *
-lz4_alloc(int flags)
-{
- return (kmem_alloc(sizeof (struct refTables), flags));
-}
+/*
+ * LZ4_uncompress_unknownOutputSize() :
+ * isize : is the input size, therefore the compressed size
+ * maxOutputSize : is the size of the destination buffer (which must be
+ * already allocated)
+ * return : the number of bytes decoded in the destination buffer
+ * (necessarily <= maxOutputSize). If the source stream is
+ * malformed, the function will stop decoding and return a
+ * negative result, indicating the byte position of the faulty
+ * instruction. This function never writes beyond dest +
+ * maxOutputSize, and is therefore protected against malicious
+ * data packets.
+ * note : Destination buffer must be already allocated.
+ * This version is slightly slower than real_LZ4_uncompress()
+ *
+ */
-static void
-lz4_free(void *ctx)
+/*
+ * Note: In upstream code, LZ4_uncompress_unknownOutputSize is now a legacy
+ * wrapper for LZ4_decompress_safe which is a wrapper for
+ * LZ4_decompress_generic; this wrapper flattens that, rather than
+ * rewriting the callers.
+ */
+int LZ4_uncompress_unknownOutputSize(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
{
- kmem_free(ctx, sizeof (struct refTables));
+ return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize,
+ endOnInputSize, decode_full_block, noDict,
+ (BYTE*)dest, NULL, 0);
}
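
The flattened wrapper keeps the historical contract: a non-negative return is the number of bytes written to dest, and a negative return flags a malformed stream. A hedged caller-side sketch; decode_block() is illustrative and not part of ZFS.

extern int LZ4_uncompress_unknownOutputSize(const char *, char *, int, int);

/* Decide success from the sign of the return value, nothing else. */
static int
decode_block(const char *src, int srclen, char *dst, int dstcap)
{
        int n = LZ4_uncompress_unknownOutputSize(src, dst, srclen, dstcap);

        if (n < 0)
                return (-1);    /* corrupt or truncated input */
        return (n);             /* bytes produced in dst */
}
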
-#endif
diff --git a/sys/contrib/openzfs/module/zfs/lz4_zfs.c b/sys/contrib/openzfs/module/zfs/lz4_zfs.c
new file mode 100644
index 000000000000..820556effb8b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/lz4_zfs.c
@@ -0,0 +1,935 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Header File
+ * Copyright (C) 2011-2013, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+/*
+ * N.B. - This file seems to be based on LZ4 r85, dated Dec 10, 2012
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+
+static int real_LZ4_compress(const char *source, char *dest, int isize,
+ int osize);
+static int LZ4_compressCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+
+/* See lz4.c */
+int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
+ int isize, int maxOutputSize);
+
+static void *lz4_alloc(int flags);
+static void lz4_free(void *ctx);
+
+size_t
+lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, int n)
+{
+ (void) n;
+ uint32_t bufsiz;
+ char *dest = d_start;
+
+ ASSERT(d_len >= sizeof (bufsiz));
+
+ bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len,
+ d_len - sizeof (bufsiz));
+
+ /* Signal an error if the compression routine returned zero. */
+ if (bufsiz == 0)
+ return (s_len);
+
+ /*
+ * The exact compressed size is needed by the decompression routine,
+ * so it is stored at the start of the buffer. Note that this may be
+ * less than the compressed block size, which is rounded up to a
+ * multiple of 1<<ashift.
+ */
+ *(uint32_t *)dest = BE_32(bufsiz);
+
+ return (bufsiz + sizeof (bufsiz));
+}
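
As a worked example of the layout, suppose the compressor emitted 1000 bytes: lz4_compress_zfs() returns 1004, and the first four bytes of d_start hold 1000 in big-endian byte order so the decompressor can later ignore any ashift padding. The standalone sketch below only illustrates the header and is not part of the patch.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint8_t dst[2048] = { 0 };
        uint32_t bufsiz = 1000;         /* pretend LZ4 emitted 1000 bytes */

        /* Store the length word big-endian, the byte order BE_32() produces. */
        dst[0] = bufsiz >> 24;
        dst[1] = bufsiz >> 16;
        dst[2] = bufsiz >> 8;
        dst[3] = bufsiz;

        printf("payload = %u bytes\n", (unsigned)(sizeof (bufsiz) + bufsiz));
        return (0);
}
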
+
+int
+lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, int n)
+{
+ (void) n;
+ const char *src = s_start;
+ uint32_t bufsiz = BE_IN32(src);
+
+ /* invalid compressed buffer size encoded at start */
+ if (bufsiz + sizeof (bufsiz) > s_len)
+ return (1);
+
+ /*
+ * Returns 0 on success (decompression function returned non-negative)
+ * and non-zero on failure (decompression function returned negative).
+ */
+ return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
+ d_start, bufsiz, d_len) < 0);
+}
+
+/*
+ * LZ4 API Description:
+ *
+ * Simple Functions:
+ * real_LZ4_compress() :
+ * isize : is the input size. Max supported value is ~1.9GB
+ * return : the number of bytes written in buffer dest
+ * or 0 if the compression fails (if LZ4_COMPRESSMIN is set).
+ * note : destination buffer must be already allocated.
+ * destination buffer must be sized to handle worst-case
+ * situations (input data not compressible); worst-case size
+ * evaluation is provided by function LZ4_compressBound().
+ *
+ * real_LZ4_uncompress() :
+ * osize : is the output size, therefore the original size
+ * return : the number of bytes read in the source buffer.
+ * If the source stream is malformed, the function will stop
+ * decoding and return a negative result, indicating the byte
+ * position of the faulty instruction. This function never
+ * writes beyond dest + osize, and is therefore protected
+ * against malicious data packets.
+ * note : destination buffer must be already allocated
+ * note : real_LZ4_uncompress() is not used in ZFS so its code
+ * is not present here.
+ *
+ * Advanced Functions
+ *
+ * LZ4_compressBound() :
+ * Provides the maximum size that LZ4 may output in a "worst case"
+ * scenario (input data not compressible); primarily useful for memory
+ * allocation of output buffer.
+ *
+ * isize : is the input size. Max supported value is ~1.9GB
+ * return : maximum output size in a "worst case" scenario
+ * note : this function is limited by "int" range (2^31-1)
+ *
+ * LZ4_uncompress_unknownOutputSize() :
+ * isize : is the input size, therefore the compressed size
+ * maxOutputSize : is the size of the destination buffer (which must be
+ * already allocated)
+ * return : the number of bytes decoded in the destination buffer
+ * (necessarily <= maxOutputSize). If the source stream is
+ * malformed, the function will stop decoding and return a
+ * negative result, indicating the byte position of the faulty
+ * instruction. This function never writes beyond dest +
+ * maxOutputSize, and is therefore protected against malicious
+ * data packets.
+ * note : Destination buffer must be already allocated.
+ * This version is slightly slower than real_LZ4_uncompress()
+ *
+ * LZ4_compressCtx() :
+ * This function explicitly handles the CTX memory structure.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_cache_alloc). Passing
+ * NULL isn't valid.
+ *
+ * LZ4_compress64kCtx() :
+ * Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
+ * isize *Must* be <64KB, otherwise the output will be corrupted.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_cache_alloc). Passing
+ * NULL isn't valid.
+ */
+
+/*
+ * Tuning parameters
+ */
+
+/*
+ * COMPRESSIONLEVEL: Increasing this value improves compression ratio.
+ * Lowering this value reduces memory usage. Reduced memory usage
+ * typically improves speed, due to cache effect (ex: L1 32KB for Intel,
+ * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes
+ * (examples : 12 -> 16KB ; 17 -> 512KB)
+ */
+#define COMPRESSIONLEVEL 12
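
The memory-usage formula can be checked directly: the hash table has 2^HASH_LOG slots of 4 bytes each (U32 on 64-bit, a 32-bit pointer otherwise), i.e. 2^(N+2) bytes. A small illustrative check:

#include <stdio.h>

int
main(void)
{
        /* 2^N slots * 4 bytes = 2^(N+2) bytes: 12 -> 16 KB, 17 -> 512 KB. */
        for (int n = 12; n <= 17; n++)
                printf("HASH_LOG=%d -> %lu KB\n", n, (1UL << (n + 2)) >> 10);
        return (0);
}
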
+
+/*
+ * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the
+ * algorithm skip faster over data segments considered "incompressible".
+ * This may decrease compression ratio dramatically, but will be
+ * faster on incompressible data. Increasing this value will make
+ * the algorithm search more before declaring a segment "incompressible".
+ * This could improve compression a bit, but will be slower on
+ * incompressible data. The default value (6) is recommended.
+ */
+#define NOTCOMPRESSIBLE_CONFIRMATION 6
+
+/*
+ * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to
+ * performance for big-endian CPUs, but the resulting compressed stream
+ * will be incompatible with little-endian CPUs. You can set this option
+ * to 1 in situations where data will stay within a closed environment.
+ * This option is useless on little-endian CPUs (such as x86).
+ */
+/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */
+
+/*
+ * CPU Feature Detection
+ */
+
+/* 32 or 64 bits ? */
+#if defined(_LP64)
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+/*
+ * Little Endian or Big Endian?
+ * Note: overwrite the below #define if you know your architecture endianness.
+ */
+#if defined(_ZFS_BIG_ENDIAN)
+#define LZ4_BIG_ENDIAN 1
+#else
+/*
+ * Little Endian assumed. PDP Endian and other very rare endian format
+ * are unsupported.
+ */
+#undef LZ4_BIG_ENDIAN
+#endif
+
+/*
+ * Unaligned memory access is automatically enabled for "common" CPUs,
+ * such as x86. For other CPUs, the compiler will be more cautious, and
+ * insert extra code to ensure aligned access is respected. If you know
+ * your target CPU supports unaligned memory access, you may want to
+ * force this option manually to improve performance
+ */
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+/*
+ * Illumos : we can't use GCC's __builtin_ctz family of builtins in the
+ * kernel
+ * Linux : we can use GCC's __builtin_ctz family of builtins in the
+ * kernel
+ */
+#undef LZ4_FORCE_SW_BITCOUNT
+#if defined(__sparc)
+#define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+/*
+ * Compiler Options
+ */
+/* Disable restrict */
+#define restrict
+
+/*
+ * Linux : GCC_VERSION is defined as of 3.9-rc1, so undefine it.
+ * torvalds/linux@3f3f8d2f48acfd8ed3b8e6b7377935da57b27b16
+ */
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+#define expect(expr, value) (__builtin_expect((expr), (value)))
+#else
+#define expect(expr, value) (expr)
+#endif
+
+#ifndef likely
+#define likely(expr) expect((expr) != 0, 1)
+#endif
+
+#ifndef unlikely
+#define unlikely(expr) expect((expr) != 0, 0)
+#endif
+
+#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \
+ (((x) & 0xffu) << 8)))
+
+/* Basic types */
+#define BYTE uint8_t
+#define U16 uint16_t
+#define U32 uint32_t
+#define S32 int32_t
+#define U64 uint64_t
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack(1)
+#endif
+
+typedef struct _U16_S {
+ U16 v;
+} U16_S;
+typedef struct _U32_S {
+ U32 v;
+} U32_S;
+typedef struct _U64_S {
+ U64 v;
+} U64_S;
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack()
+#endif
+
+#define A64(x) (((U64_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A16(x) (((U16_S *)(x))->v)
+
+/*
+ * Constants
+ */
+#define MINMATCH 4
+
+#define HASH_LOG COMPRESSIONLEVEL
+#define HASHTABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASHTABLESIZE - 1)
+
+#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \
+ NOTCOMPRESSIBLE_CONFIRMATION : 2)
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH + MINMATCH)
+#define MINLENGTH (MFLIMIT + 1)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
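
With ML_BITS = 4, every sequence begins with a single token byte: the high nibble is the literal count and the low nibble is the match length minus MINMATCH, and a nibble of 15 means extension bytes follow. Decoding a hypothetical token as a self-contained check:

#include <stdio.h>

int
main(void)
{
        unsigned token = 0x53;                  /* hypothetical token byte */
        unsigned literals = token >> 4;         /* high nibble: 5 literals */
        unsigned matchlen = (token & 15) + 4;   /* low nibble + MINMATCH: 7 */

        printf("literals=%u matchlen=%u\n", literals, matchlen);
        return (0);
}
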
+
+
+/*
+ * Architecture-specific macros
+ */
+#if LZ4_ARCH64
+#define STEPSIZE 8
+#define UARCH U64
+#define AARCH A64
+#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d)
+#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e)
+#define HTYPE U32
+#define INITBASE(base) const BYTE* const base = ip
+#else /* !LZ4_ARCH64 */
+#define STEPSIZE 4
+#define UARCH U32
+#define AARCH A32
+#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d);
+#define LZ4_SECURECOPY LZ4_WILDCOPY
+#define HTYPE const BYTE *
+#define INITBASE(base) const int base = 0
+#endif /* !LZ4_ARCH64 */
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+ { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \
+ { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; }
+#else
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; }
+#endif
+
+
+/* Local structures */
+struct refTables {
+ HTYPE hashTable[HASHTABLESIZE];
+};
+
+
+/* Macros */
+#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \
+ HASH_LOG))
+#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
+#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e);
+#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \
+ d = e; }
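
LZ4_HASH_FUNCTION() is multiplicative (Fibonacci) hashing: the four bytes at p are multiplied by 2654435761, a prime close to 2^32/phi, and the top HASH_LOG bits of the 32-bit product select the slot, so with HASH_LOG = 12 the shift is 32 - 12 = 20. A standalone sketch of the same arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as LZ4_HASH_FUNCTION() with HASH_LOG = 12; the result
 * always fits in 12 bits, i.e. it indexes a 4096-entry table. */
static uint32_t
toy_hash(uint32_t four_input_bytes)
{
        return ((four_input_bytes * 2654435761U) >> (32 - 12));
}

int
main(void)
{
        uint32_t bytes = 0x64636261;    /* "abcd" read little-endian */

        printf("slot = %u (always < 4096)\n", toy_hash(bytes));
        return (0);
}
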
+
+
+/* Private functions */
+#if LZ4_ARCH64
+
+static inline int
+LZ4_NbCommonBytes(register U64 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clzll(val) >> 3);
+#else
+ int r;
+ if (!(val >> 32)) {
+ r = 4;
+ } else {
+ r = 0;
+ val >>= 32;
+ }
+ if (!(val >> 16)) {
+ r += 2;
+ val >>= 8;
+ } else {
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctzll(val) >> 3);
+#else
+ static const int DeBruijnBytePos[64] =
+ { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5,
+ 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5,
+ 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4,
+ 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+ };
+ return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >>
+ 58];
+#endif
+#endif
+}
+
+#else
+
+static inline int
+LZ4_NbCommonBytes(register U32 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+#else
+ int r;
+ if (!(val >> 16)) {
+ r = 2;
+ val >>= 8;
+ } else {
+ r = 0;
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if defined(__GNUC__) && (GCC_VERSION >= 304) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctz(val) >> 3);
+#else
+ static const int DeBruijnBytePos[32] = {
+ 0, 0, 3, 0, 3, 1, 3, 0,
+ 3, 2, 2, 1, 3, 2, 0, 1,
+ 3, 3, 1, 2, 2, 2, 2, 0,
+ 3, 1, 2, 0, 1, 0, 1, 1
+ };
+ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >>
+ 27];
+#endif
+#endif
+}
+
+#endif
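
LZ4_NbCommonBytes() answers "how many leading bytes of two words are equal" by looking at their XOR: on little-endian machines that is the count of trailing zero bits divided by eight. A userland illustration; toy_common_bytes() is not part of the patch.

#include <stdint.h>
#include <stdio.h>

static int
toy_common_bytes(uint64_t a, uint64_t b)
{
        uint64_t diff = a ^ b;

        /* Trailing zero bits / 8 = number of equal low-order bytes,
         * which are the leading bytes in memory on little-endian. */
        return (diff ? __builtin_ctzll(diff) >> 3 : 8);
}

int
main(void)
{
        uint64_t a = 0x6867666564636261ULL;     /* "abcdefgh" in memory */
        uint64_t b = 0x5867666564636261ULL;     /* "abcdefgX" in memory */

        printf("%d common bytes\n", toy_common_bytes(a, b));    /* 7 */
        return (0);
}
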
+
+/* Compression functions */
+
+static int
+LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+ struct refTables *srt = (struct refTables *)ctx;
+ HTYPE *HashTable = (HTYPE *) (srt->hashTable);
+
+ const BYTE *ip = (BYTE *) source;
+ INITBASE(base);
+ const BYTE *anchor = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if (unlikely(forwardIp > mflimit)) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (BYTE *) source) &&
+ unlikely(ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while (likely(ip < matchlimit - (STEPSIZE - 1))) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH_VALUE(ip)];
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255) {
+ *op++ = 255;
+ }
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+
+
+/* Note : this function is valid only if isize < LZ4_64KLIMIT */
+#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1))
+#define HASHLOG64K (HASH_LOG + 1)
+#define HASH64KTABLESIZE (1U << HASHLOG64K)
+#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \
+ HASHLOG64K))
+#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p))
+
+static int
+LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+ struct refTables *srt = (struct refTables *)ctx;
+ U16 *HashTable = (U16 *) (srt->hashTable);
+
+ const BYTE *ip = (BYTE *) source;
+ const BYTE *anchor = ip;
+ const BYTE *const base = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if (forwardIp > mflimit) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH64K_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while (A32(ref) != A32(ip));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (BYTE *) source) &&
+ (ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while (ip < matchlimit - (STEPSIZE - 1)) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH64K_VALUE(ip)];
+ HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base;
+ if (A32(ref) == A32(ip)) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+static int
+real_LZ4_compress(const char *source, char *dest, int isize, int osize)
+{
+ void *ctx;
+ int result;
+
+ ctx = lz4_alloc(KM_SLEEP);
+
+ /*
+ * out of kernel memory, gently fall through - this will disable
+ * compression in zio_compress_data
+ */
+ if (ctx == NULL)
+ return (0);
+
+ memset(ctx, 0, sizeof (struct refTables));
+
+ if (isize < LZ4_64KLIMIT)
+ result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
+ else
+ result = LZ4_compressCtx(ctx, source, dest, isize, osize);
+
+ lz4_free(ctx);
+ return (result);
+}
+
+#ifdef __FreeBSD__
+/*
+ * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here.
+ * Should struct refTables get resized this may need to be revisited, hence
+ * compiler-time asserts.
+ */
+_Static_assert(sizeof(struct refTables) <= 16384,
+ "refTables too big for malloc");
+_Static_assert((sizeof(struct refTables) % 4096) == 0,
+ "refTables not a multiple of page size");
+#else
+#define ZFS_LZ4_USE_CACHE
+#endif
+
+#ifdef ZFS_LZ4_USE_CACHE
+static kmem_cache_t *lz4_cache;
+#endif
+
+#ifdef ZFS_LZ4_USE_CACHE
+void
+lz4_init(void)
+{
+ lz4_cache = kmem_cache_create("lz4_cache",
+ sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+lz4_fini(void)
+{
+ if (lz4_cache) {
+ kmem_cache_destroy(lz4_cache);
+ lz4_cache = NULL;
+ }
+}
+
+static void *
+lz4_alloc(int flags)
+{
+ ASSERT(lz4_cache != NULL);
+ return (kmem_cache_alloc(lz4_cache, flags));
+}
+
+static void
+lz4_free(void *ctx)
+{
+ kmem_cache_free(lz4_cache, ctx);
+}
+#else
+void
+lz4_init(void)
+{
+}
+
+void
+lz4_fini(void)
+{
+}
+
+static void *
+lz4_alloc(int flags)
+{
+ return (kmem_alloc(sizeof (struct refTables), flags));
+}
+
+static void
+lz4_free(void *ctx)
+{
+ kmem_free(ctx, sizeof (struct refTables));
+}
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/lzjb.c b/sys/contrib/openzfs/module/zfs/lzjb.c
index a478e64c5141..a24f17e0fe74 100644
--- a/sys/contrib/openzfs/module/zfs/lzjb.c
+++ b/sys/contrib/openzfs/module/zfs/lzjb.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -45,10 +45,10 @@
#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
#define LEMPEL_SIZE 1024
-/*ARGSUSED*/
size_t
lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
{
+ (void) n;
uchar_t *src = s_start;
uchar_t *dst = d_start;
uchar_t *cpy;
@@ -100,10 +100,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
return (dst - (uchar_t *)d_start);
}
-/*ARGSUSED*/
int
lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
{
+ (void) s_len, (void) n;
uchar_t *src = s_start;
uchar_t *dst = d_start;
uchar_t *d_end = (uchar_t *)d_start + d_len;
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index d1fee70f004b..7170b5eefcea 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -40,23 +40,26 @@
#include <sys/zap.h>
#include <sys/btree.h>
-#define WITH_DF_BLOCK_ALLOCATOR
-
#define GANG_ALLOCATION(flags) \
((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
/*
* Metaslab granularity, in bytes. This is roughly similar to what would be
* referred to as the "stripe size" in traditional RAID arrays. In normal
- * operation, we will try to write this amount of data to a top-level vdev
- * before moving on to the next one.
+ * operation, we will try to write this amount of data to each disk before
+ * moving on to the next top-level vdev.
*/
-unsigned long metaslab_aliquot = 512 << 10;
+static uint64_t metaslab_aliquot = 1024 * 1024;
/*
* For testing, make some blocks above a certain size be gang blocks.
*/
-unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
+uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
+
+/*
+ * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
+ */
+uint_t metaslab_force_ganging_pct = 3;
/*
* In pools where the log space map feature is not enabled we touch
@@ -81,7 +84,7 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17);
* space map representation must be before we compact it on-disk.
* Values should be greater than or equal to 100.
*/
-int zfs_condense_pct = 200;
+uint_t zfs_condense_pct = 200;
/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
@@ -96,7 +99,7 @@ int zfs_condense_pct = 200;
* uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
* blocks.
*/
-int zfs_metaslab_condense_block_threshold = 4;
+static const int zfs_metaslab_condense_block_threshold = 4;
/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
@@ -111,7 +114,7 @@ int zfs_metaslab_condense_block_threshold = 4;
* eligible to allocate on any metaslab group. The default value of 0 means
* no metaslab group will be excluded based on this criterion.
*/
-int zfs_mg_noalloc_threshold = 0;
+static uint_t zfs_mg_noalloc_threshold = 0;
/*
* Metaslab groups are considered eligible for allocations if their
@@ -135,7 +138,7 @@ int zfs_mg_noalloc_threshold = 0;
* enough to avoid hitting the speed bump on pools that are being pushed
* to the edge.
*/
-int zfs_mg_fragmentation_threshold = 95;
+static uint_t zfs_mg_fragmentation_threshold = 95;
/*
* Allow metaslabs to keep their active state as long as their fragmentation
@@ -143,17 +146,17 @@ int zfs_mg_fragmentation_threshold = 95;
* active metaslab that exceeds this threshold will no longer keep its active
* status allowing better metaslabs to be selected.
*/
-int zfs_metaslab_fragmentation_threshold = 70;
+static uint_t zfs_metaslab_fragmentation_threshold = 70;
/*
* When set will load all metaslabs when pool is first opened.
*/
-int metaslab_debug_load = 0;
+int metaslab_debug_load = B_FALSE;
/*
* When set will prevent metaslabs from being unloaded.
*/
-int metaslab_debug_unload = 0;
+static int metaslab_debug_unload = B_FALSE;
/*
* Minimum size which forces the dynamic allocator to change
@@ -169,7 +172,7 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
* Once the space map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
-int metaslab_df_free_pct = 4;
+uint_t metaslab_df_free_pct = 4;
/*
* Maximum distance to search forward from the last offset. Without this
@@ -184,14 +187,14 @@ int metaslab_df_free_pct = 4;
* With the default setting of 16MB this is 16*1024 (with ashift=9) or
* 2048 (with ashift=12).
*/
-int metaslab_df_max_search = 16 * 1024 * 1024;
+static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
/*
* Forces the metaslab_block_picker function to search for at least this many
* segments forwards until giving up on finding a segment that the allocation
* will fit into.
*/
-uint32_t metaslab_min_search_count = 100;
+static const uint32_t metaslab_min_search_count = 100;
/*
* If we are not searching forward (due to metaslab_df_max_search,
@@ -200,12 +203,7 @@ uint32_t metaslab_min_search_count = 100;
* segment. If it is not set, we will use a segment of exactly the requested
* size (or larger).
*/
-int metaslab_df_use_largest_segment = B_FALSE;
-
-/*
- * Percentage of all cpus that can be used by the metaslab taskq.
- */
-int metaslab_load_pct = 50;
+static int metaslab_df_use_largest_segment = B_FALSE;
/*
* These tunables control how long a metaslab will remain loaded after the
@@ -215,56 +213,56 @@ int metaslab_load_pct = 50;
* unloaded sooner. These settings are intended to be generous -- to keep
* metaslabs loaded for a long time, reducing the rate of metaslab loading.
*/
-int metaslab_unload_delay = 32;
-int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
+static uint_t metaslab_unload_delay = 32;
+static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
/*
* Max number of metaslabs per group to preload.
*/
-int metaslab_preload_limit = 10;
+uint_t metaslab_preload_limit = 10;
/*
* Enable/disable preloading of metaslab.
*/
-int metaslab_preload_enabled = B_TRUE;
+static int metaslab_preload_enabled = B_TRUE;
/*
* Enable/disable fragmentation weighting on metaslabs.
*/
-int metaslab_fragmentation_factor_enabled = B_TRUE;
+static int metaslab_fragmentation_factor_enabled = B_TRUE;
/*
* Enable/disable lba weighting (i.e. outer tracks are given preference).
*/
-int metaslab_lba_weighting_enabled = B_TRUE;
+static int metaslab_lba_weighting_enabled = B_TRUE;
/*
* Enable/disable metaslab group biasing.
*/
-int metaslab_bias_enabled = B_TRUE;
+static int metaslab_bias_enabled = B_TRUE;
/*
* Enable/disable remapping of indirect DVAs to their concrete vdevs.
*/
-boolean_t zfs_remap_blkptr_enable = B_TRUE;
+static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
/*
* Enable/disable segment-based metaslab selection.
*/
-int zfs_metaslab_segment_weight_enabled = B_TRUE;
+static int zfs_metaslab_segment_weight_enabled = B_TRUE;
/*
* When using segment-based metaslab selection, we will continue
* allocating from the active metaslab until we have exhausted
* zfs_metaslab_switch_threshold of its buckets.
*/
-int zfs_metaslab_switch_threshold = 2;
+static int zfs_metaslab_switch_threshold = 2;
/*
* Internal switch to enable/disable the metaslab allocation tracing
* facility.
*/
-boolean_t metaslab_trace_enabled = B_FALSE;
+static const boolean_t metaslab_trace_enabled = B_FALSE;
/*
* Maximum entries that the metaslab allocation tracing facility will keep
@@ -274,32 +272,32 @@ boolean_t metaslab_trace_enabled = B_FALSE;
* to ever exceed this value. In debug mode, the system will panic if this
* limit is ever reached allowing for further investigation.
*/
-uint64_t metaslab_trace_max_entries = 5000;
+static const uint64_t metaslab_trace_max_entries = 5000;
/*
* Maximum number of metaslabs per group that can be disabled
* simultaneously.
*/
-int max_disabled_ms = 3;
+static const int max_disabled_ms = 3;
/*
* Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
* To avoid 64-bit overflow, don't set above UINT32_MAX.
*/
-unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
/*
* Maximum percentage of memory to use on storing loaded metaslabs. If loading
* a metaslab would take it over this percentage, the oldest selected metaslab
* is automatically unloaded.
*/
-int zfs_metaslab_mem_limit = 25;
+static uint_t zfs_metaslab_mem_limit = 25;
/*
* Force the per-metaslab range trees to use 64-bit integers to store
* segments. Used for debugging purposes.
*/
-boolean_t zfs_metaslab_force_large_segs = B_FALSE;
+static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
/*
* By default we only store segments over a certain size in the size-sorted
@@ -308,7 +306,7 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE;
* improves load and unload times at the cost of causing us to use slightly
* larger segments than we would otherwise in some cases.
*/
-uint32_t metaslab_by_size_min_shift = 14;
+static const uint32_t metaslab_by_size_min_shift = 14;
/*
* If not set, we will first try normal allocation. If that fails then
@@ -321,7 +319,7 @@ uint32_t metaslab_by_size_min_shift = 14;
* allocation. If that fails we will do a "try hard" gang allocation. If
* that fails then we will have a multi-layer gang block.
*/
-int zfs_metaslab_try_hard_before_gang = B_FALSE;
+static int zfs_metaslab_try_hard_before_gang = B_FALSE;
/*
* When not trying hard, we only consider the best zfs_metaslab_find_max_tries
@@ -337,7 +335,7 @@ int zfs_metaslab_try_hard_before_gang = B_FALSE;
* subsequent metaslab has ms_max_size >60KB (but fewer segments in this
* bucket, and therefore a lower weight).
*/
-int zfs_metaslab_find_max_tries = 100;
+static uint_t zfs_metaslab_find_max_tries = 100;
static uint64_t metaslab_weight(metaslab_t *, boolean_t);
static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
@@ -370,7 +368,7 @@ static metaslab_stats_t metaslab_stats = {
atomic_inc_64(&metaslab_stats.stat.value.ui64);
-kstat_t *metaslab_ksp;
+static kstat_t *metaslab_ksp;
void
metaslab_stat_init(void)
@@ -406,7 +404,7 @@ metaslab_stat_fini(void)
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
+metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
{
metaslab_class_t *mc;
@@ -629,8 +627,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
* metaslabs. We report the expandable space in terms
* of the metaslab size since that's the unit of expansion.
*/
- space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
- 1ULL << tvd->vdev_ms_shift);
+ space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift, uint64_t);
}
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
return (space);
@@ -640,8 +638,9 @@ void
metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
{
multilist_t *ml = &mc->mc_metaslab_txg_list;
+ hrtime_t now = gethrtime();
for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
- multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
metaslab_t *msp = multilist_sublist_head(mls);
multilist_sublist_unlock(mls);
while (msp != NULL) {
@@ -658,13 +657,15 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
i--;
break;
}
- mls = multilist_sublist_lock(ml, i);
+ mls = multilist_sublist_lock_idx(ml, i);
metaslab_t *next_msp = multilist_sublist_next(mls, msp);
multilist_sublist_unlock(mls);
if (txg >
msp->ms_selected_txg + metaslab_unload_delay &&
- gethrtime() > msp->ms_selected_time +
- (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
+ now > msp->ms_selected_time +
+ MSEC2NSEC(metaslab_unload_delay_ms) &&
+ (msp->ms_allocator == -1 ||
+ !metaslab_preload_enabled)) {
metaslab_evict(msp, txg);
} else {
/*
@@ -851,9 +852,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
}
- mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
- maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
-
return (mg);
}
@@ -869,7 +867,6 @@ metaslab_group_destroy(metaslab_group_t *mg)
*/
ASSERT(mg->mg_activation_count <= 0);
- taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
mutex_destroy(&mg->mg_ms_disabled_lock);
@@ -899,7 +896,8 @@ metaslab_group_activate(metaslab_group_t *mg)
if (++mg->mg_activation_count <= 0)
return;
- mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+ mg->mg_aliquot = metaslab_aliquot * MAX(1,
+ vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
metaslab_group_alloc_update(mg);
if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
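With metaslab_aliquot now expressed per disk, the group aliquot scales with the number of data disks rather than the raw child count. A rough arithmetic sketch, assuming a hypothetical 10-disk raidz2 top-level vdev and the 1 MiB default set near the top of this file:

/*
 * data disks = vdev_get_ndisks(vd) - vdev_get_nparity(vd) = 10 - 2 = 8
 * mg_aliquot = metaslab_aliquot * 8 = 1 MiB * 8 = 8 MiB
 * i.e. about 8 MiB is written to this top-level vdev before the
 * allocation rotor advances to the next one.
 */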
@@ -959,7 +957,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
* allocations from taking place and any changes to the vdev tree.
*/
spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
- taskq_wait_outstanding(mg->mg_taskq, 0);
+ taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
for (int i = 0; i < mg->mg_allocators; i++) {
@@ -1222,7 +1220,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
*/
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
- uint64_t psize, int allocator, int d)
+ int flags, uint64_t psize, int allocator, int d)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
@@ -1267,6 +1265,15 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
return (B_FALSE);
/*
+ * Some allocations (e.g., those coming from device removal
+ * where the allocations are not even counted in the
+ * metaslab allocation queues) are allowed to bypass
+ * the throttle.
+ */
+ if (flags & METASLAB_DONT_THROTTLE)
+ return (B_TRUE);
+
+ /*
* Relax allocation throttling for ditto blocks. Due to
* random imbalances in allocation it tends to push copies
* to one vdev, that looks a bit better at the moment.
@@ -1277,7 +1284,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
/*
* If this metaslab group is below its qmax or it's
- * the only allocatable metasable group, then attempt
+ * the only allocatable metaslab group, then attempt
* to allocate from it.
*/
if (qdepth < qmax || mc->mc_alloc_groups == 1)
@@ -1332,6 +1339,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* Comparison function for the private size-ordered tree using 32-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
+__attribute__((always_inline)) inline
static int
metaslab_rangesize32_compare(const void *x1, const void *x2)
{
@@ -1342,16 +1350,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
- if (likely(cmp))
- return (cmp);
- return (TREE_CMP(r1->rs_start, r2->rs_start));
+ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
/*
* Comparison function for the private size-ordered tree using 64-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
+__attribute__((always_inline)) inline
static int
metaslab_rangesize64_compare(const void *x1, const void *x2)
{
@@ -1362,11 +1369,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
- if (likely(cmp))
- return (cmp);
- return (TREE_CMP(r1->rs_start, r2->rs_start));
+ return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
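Both rewritten comparators rely on TREE_CMP() returning -1, 0, or 1; the following is an equivalence sketch of the branchless form that replaces the old early return:

/*
 * cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)
 *   == cmp                                  when cmp != 0  (!cmp == 0)
 *   == TREE_CMP(r1->rs_start, r2->rs_start) when cmp == 0  (!cmp == 1)
 * which matches "if (cmp) return (cmp); return (TREE_CMP(...));"
 * without the branch, keeping the always_inline copies cheap.
 */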
+
typedef struct metaslab_rt_arg {
zfs_btree_t *mra_bt;
uint32_t mra_floor_shift;
@@ -1402,11 +1408,17 @@ metaslab_size_tree_full_load(range_tree_t *rt)
range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
+ range_seg32_t, metaslab_rangesize32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
+ range_seg64_t, metaslab_rangesize64_compare)
+
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
*/
-/* ARGSUSED */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
@@ -1415,26 +1427,29 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
size_t size;
int (*compare) (const void *, const void *);
+ bt_find_in_buf_f bt_find;
switch (rt->rt_type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = metaslab_rangesize32_compare;
+ bt_find = metaslab_rt_find_rangesize32_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = metaslab_rangesize64_compare;
+ bt_find = metaslab_rt_find_rangesize64_in_buf;
break;
default:
panic("Invalid range seg type %d", rt->rt_type);
}
- zfs_btree_create(size_tree, compare, size);
+ zfs_btree_create(size_tree, compare, bt_find, size);
mrap->mra_floor_shift = metaslab_by_size_min_shift;
}
-/* ARGSUSED */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
+ (void) rt;
metaslab_rt_arg_t *mrap = arg;
zfs_btree_t *size_tree = mrap->mra_bt;
@@ -1442,7 +1457,6 @@ metaslab_rt_destroy(range_tree_t *rt, void *arg)
kmem_free(mrap, sizeof (*mrap));
}
-/* ARGSUSED */
static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
@@ -1450,27 +1464,25 @@ metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
zfs_btree_t *size_tree = mrap->mra_bt;
if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
- (1 << mrap->mra_floor_shift))
+ (1ULL << mrap->mra_floor_shift))
return;
zfs_btree_add(size_tree, rs);
}
-/* ARGSUSED */
static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
metaslab_rt_arg_t *mrap = arg;
zfs_btree_t *size_tree = mrap->mra_bt;
- if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL <<
mrap->mra_floor_shift))
return;
zfs_btree_remove(size_tree, rs);
}
-/* ARGSUSED */
static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
@@ -1482,7 +1494,7 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg)
metaslab_rt_create(rt, arg);
}
-static range_tree_ops_t metaslab_rt_ops = {
+static const range_tree_ops_t metaslab_rt_ops = {
.rtop_create = metaslab_rt_create,
.rtop_destroy = metaslab_rt_destroy,
.rtop_add = metaslab_rt_add,
@@ -1602,9 +1614,6 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
return (rs);
}
-#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
- defined(WITH_CF_BLOCK_ALLOCATOR)
-
/*
* This is a helper function that can be used by the allocator to find a
* suitable block to allocate. This will search the specified B-tree looking
@@ -1639,9 +1648,74 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
*cursor = 0;
return (-1ULL);
}
-#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
-#if defined(WITH_DF_BLOCK_ALLOCATOR)
+static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
+static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
+static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
+metaslab_ops_t *metaslab_allocator(spa_t *spa);
+
+static metaslab_ops_t metaslab_allocators[] = {
+ { "dynamic", metaslab_df_alloc },
+ { "cursor", metaslab_cf_alloc },
+ { "new-dynamic", metaslab_ndf_alloc },
+};
+
+static int
+spa_find_allocator_byname(const char *val)
+{
+ int a = ARRAY_SIZE(metaslab_allocators) - 1;
+ if (strcmp("new-dynamic", val) == 0)
+ return (-1); /* remove when ndf is working */
+ for (; a >= 0; a--) {
+ if (strcmp(val, metaslab_allocators[a].msop_name) == 0)
+ return (a);
+ }
+ return (-1);
+}
+
+void
+spa_set_allocator(spa_t *spa, const char *allocator)
+{
+ int a = spa_find_allocator_byname(allocator);
+ if (a < 0) a = 0;
+ spa->spa_active_allocator = a;
+ zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name);
+}
+
+int
+spa_get_allocator(spa_t *spa)
+{
+ return (spa->spa_active_allocator);
+}
+
+#if defined(_KERNEL)
+int
+param_set_active_allocator_common(const char *val)
+{
+ char *p;
+
+ if (val == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if ((p = strchr(val, '\n')) != NULL)
+ *p = '\0';
+
+ int a = spa_find_allocator_byname(val);
+ if (a < 0)
+ return (SET_ERROR(EINVAL));
+
+ zfs_active_allocator = metaslab_allocators[a].msop_name;
+ return (0);
+}
+#endif
+
+metaslab_ops_t *
+metaslab_allocator(spa_t *spa)
+{
+ int allocator = spa_get_allocator(spa);
+ return (&metaslab_allocators[allocator]);
+}
+
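The table above is keyed by msop_name, spa_set_allocator() silently falls back to "dynamic" (index 0) for unknown names, and the ZFS_MODULE_PARAM_CALL() at the end of this file exposes the same lookup as the zfs_active_allocator parameter. A minimal caller-side sketch, assuming spa, msp, and size come from the surrounding code:

spa_set_allocator(spa, "cursor");             /* unknown names fall back to "dynamic" */
metaslab_ops_t *ops = metaslab_allocator(spa);
uint64_t offset = ops->msop_alloc(msp, size); /* dispatches to metaslab_cf_alloc() */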
/*
* ==========================================================================
* Dynamic Fit (df) block allocator
@@ -1675,7 +1749,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
uint64_t align = size & -size;
uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
range_tree_t *rt = msp->ms_allocatable;
- int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+ uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
uint64_t offset;
ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1716,14 +1790,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
return (offset);
}
-static metaslab_ops_t metaslab_df_ops = {
- metaslab_df_alloc
-};
-
-metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
-#endif /* WITH_DF_BLOCK_ALLOCATOR */
-
-#if defined(WITH_CF_BLOCK_ALLOCATOR)
/*
* ==========================================================================
* Cursor fit block allocator -
@@ -1766,14 +1832,6 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
return (offset);
}
-static metaslab_ops_t metaslab_cf_ops = {
- metaslab_cf_alloc
-};
-
-metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
-#endif /* WITH_CF_BLOCK_ALLOCATOR */
-
-#if defined(WITH_NDF_BLOCK_ALLOCATOR)
/*
* ==========================================================================
* New dynamic fit allocator -
@@ -1830,14 +1888,6 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
return (-1ULL);
}
-static metaslab_ops_t metaslab_ndf_ops = {
- metaslab_ndf_alloc
-};
-
-metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
-#endif /* WITH_NDF_BLOCK_ALLOCATOR */
-
-
/*
* ==========================================================================
* Metaslabs
@@ -1962,9 +2012,9 @@ metaslab_aux_histograms_clear(metaslab_t *msp)
*/
ASSERT(msp->ms_loaded);
- bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
for (int t = 0; t < TXG_DEFER_SIZE; t++)
- bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+ memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t]));
}
static void
@@ -2054,13 +2104,13 @@ metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
*/
uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
if (defer_allowed) {
- bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
sizeof (msp->ms_synchist));
} else {
- bzero(msp->ms_deferhist[hist_index],
+ memset(msp->ms_deferhist[hist_index], 0,
sizeof (msp->ms_deferhist[hist_index]));
}
- bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
}
/*
@@ -2178,19 +2228,19 @@ metaslab_potentially_evict(metaslab_class_t *mc)
uint64_t allmem = arc_all_memory();
uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
- int tries = 0;
+ uint_t tries = 0;
for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
tries++) {
unsigned int idx = multilist_get_random_index(
&mc->mc_metaslab_txg_list);
multilist_sublist_t *mls =
- multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
+ multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
metaslab_t *msp = multilist_sublist_head(mls);
multilist_sublist_unlock(mls);
while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
inuse * size) {
- VERIFY3P(mls, ==, multilist_sublist_lock(
+ VERIFY3P(mls, ==, multilist_sublist_lock_idx(
&mc->mc_metaslab_txg_list, idx));
ASSERT3U(idx, ==,
metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
@@ -2240,6 +2290,8 @@ metaslab_potentially_evict(metaslab_class_t *mc)
inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
}
}
+#else
+ (void) mc, (void) zfs_metaslab_mem_limit;
#endif
}
@@ -2758,7 +2810,8 @@ metaslab_fini_flush_data(metaslab_t *msp)
mutex_exit(&spa->spa_flushed_ms_lock);
spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
- spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+ spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
+ metaslab_unflushed_dirty(msp));
}
uint64_t
@@ -2857,7 +2910,7 @@ metaslab_fini(metaslab_t *msp)
* of the table. Since the fragmentation value is never stored on disk, it
* is possible to change these calculations in the future.
*/
-int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
100, /* 512B */
100, /* 1K */
98, /* 2K */
@@ -3192,6 +3245,15 @@ static boolean_t
metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
{
/*
+ * This case will usually but not always get caught by the checks below;
+ * metaslabs can be loaded by various means, including the trim and
+ * initialize code. Once that happens, without this check they are
+ * allocatable even before they finish their first txg sync.
+ */
+ if (unlikely(msp->ms_new))
+ return (B_FALSE);
+
+ /*
* If the metaslab is loaded, ms_max_size is definitive and we can use
* the fast check. If it's not, the ms_max_size is a lower bound (once
* set), and we should use the fast check as long as we're not in
@@ -3503,10 +3565,8 @@ metaslab_group_preload(metaslab_group_t *mg)
avl_tree_t *t = &mg->mg_metaslab_tree;
int m = 0;
- if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
- taskq_wait_outstanding(mg->mg_taskq, 0);
+ if (spa_shutting_down(spa) || !metaslab_preload_enabled)
return;
- }
mutex_enter(&mg->mg_lock);
@@ -3526,8 +3586,9 @@ metaslab_group_preload(metaslab_group_t *mg)
continue;
}
- VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
- msp, TQ_SLEEP) != TASKQID_INVALID);
+ VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
+ msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
+ != TASKQID_INVALID);
}
mutex_exit(&mg->mg_lock);
}
@@ -3558,7 +3619,7 @@ metaslab_should_condense(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
vdev_t *vd = msp->ms_group->mg_vd;
- uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+ uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
@@ -3736,50 +3797,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
metaslab_flush_update(msp, tx);
}
-/*
- * Called when the metaslab has been flushed (its own spacemap now reflects
- * all the contents of the pool-wide spacemap log). Updates the metaslab's
- * metadata and any pool-wide related log space map data (e.g. summary,
- * obsolete logs, etc..) to reflect that.
- */
static void
-metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
{
- metaslab_group_t *mg = msp->ms_group;
- spa_t *spa = mg->mg_vd->vdev_spa;
-
- ASSERT(MUTEX_HELD(&msp->ms_lock));
-
- ASSERT3U(spa_sync_pass(spa), ==, 1);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+ ASSERT(msp->ms_sm != NULL);
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
- /*
- * Just because a metaslab got flushed, that doesn't mean that
- * it will pass through metaslab_sync_done(). Thus, make sure to
- * update ms_synced_length here in case it doesn't.
- */
- msp->ms_synced_length = space_map_length(msp->ms_sm);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ metaslab_set_unflushed_dirty(msp, B_TRUE);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
- /*
- * We may end up here from metaslab_condense() without the
- * feature being active. In that case this is a no-op.
- */
- if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
- return;
+ spa_log_sm_increment_current_mscount(spa);
+ spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
+}
+void
+metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
ASSERT(spa_syncing_log_sm(spa) != NULL);
ASSERT(msp->ms_sm != NULL);
ASSERT(metaslab_unflushed_txg(msp) != 0);
ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
/* update metaslab's position in our flushing tree */
uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
+ boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
mutex_enter(&spa->spa_flushed_ms_lock);
avl_remove(&spa->spa_metaslabs_by_flushed, msp);
metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ metaslab_set_unflushed_dirty(msp, dirty);
avl_add(&spa->spa_metaslabs_by_flushed, msp);
mutex_exit(&spa->spa_flushed_ms_lock);
@@ -3787,17 +3843,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
spa_log_sm_increment_current_mscount(spa);
+ /* update log space map summary */
+ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
+ ms_prev_flushed_dirty);
+ spa_log_summary_add_flushed_metaslab(spa, dirty);
+
/* cleanup obsolete logs if any */
- uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
spa_cleanup_old_sm_logs(spa, tx);
- uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
- VERIFY3U(log_blocks_after, <=, log_blocks_before);
+}
- /* update log space map summary */
- uint64_t blocks_gone = log_blocks_before - log_blocks_after;
- spa_log_summary_add_flushed_metaslab(spa);
- spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
- spa_log_summary_decrement_blkcount(spa, blocks_gone);
+/*
+ * Called when the metaslab has been flushed (its own spacemap now reflects
+ * all the contents of the pool-wide spacemap log). Updates the metaslab's
+ * metadata and any pool-wide related log space map data (e.g. summary,
+ * obsolete logs, etc..) to reflect that.
+ */
+static void
+metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+
+ /*
+ * Just because a metaslab got flushed, that doesn't mean that
+ * it will pass through metaslab_sync_done(). Thus, make sure to
+ * update ms_synced_length here in case it doesn't.
+ */
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ /*
+ * We may end up here from metaslab_condense() without the
+ * feature being active. In that case this is a no-op.
+ */
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) ||
+ metaslab_unflushed_txg(msp) == 0)
+ return;
+
+ metaslab_unflushed_bump(msp, tx, B_FALSE);
}
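Taken together with the metaslab_sync() hunk further down, this refactor splits the old logic into a first-time registration path and a bump path; a sketch of the resulting call flow, using only names introduced in this diff:

/*
 * metaslab_sync():
 *     if (metaslab_unflushed_txg(msp) == 0)
 *         metaslab_unflushed_add(msp, tx);           first registration, dirty
 *     else if (!metaslab_unflushed_dirty(msp))
 *         metaslab_unflushed_bump(msp, tx, B_TRUE);  re-dirty in this txg
 * metaslab_flush_update():
 *     metaslab_unflushed_bump(msp, tx, B_FALSE);     flushed, now clean
 */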
boolean_t
@@ -4013,23 +4099,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
ASSERT0(metaslab_allocated_space(msp));
}
- if (metaslab_unflushed_txg(msp) == 0 &&
- spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
- ASSERT(spa_syncing_log_sm(spa) != NULL);
-
- metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
- spa_log_sm_increment_current_mscount(spa);
- spa_log_summary_add_flushed_metaslab(spa);
-
- ASSERT(msp->ms_sm != NULL);
- mutex_enter(&spa->spa_flushed_ms_lock);
- avl_add(&spa->spa_metaslabs_by_flushed, msp);
- mutex_exit(&spa->spa_flushed_ms_lock);
-
- ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
- ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
- }
-
if (!range_tree_is_empty(msp->ms_checkpointing) &&
vd->vdev_checkpoint_sm == NULL) {
ASSERT(spa_has_checkpoint(spa));
@@ -4077,6 +4146,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_t *log_sm = spa_syncing_log_sm(spa);
if (log_sm != NULL) {
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+ if (metaslab_unflushed_txg(msp) == 0)
+ metaslab_unflushed_add(msp, tx);
+ else if (!metaslab_unflushed_dirty(msp))
+ metaslab_unflushed_bump(msp, tx, B_TRUE);
space_map_write(log_sm, alloctree, SM_ALLOC,
vd->vdev_id, tx);
@@ -4272,7 +4345,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
metaslab_class_get_alloc(spa_normal_class(spa));
- if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
+ if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
+ vd->vdev_rz_expanding) {
defer_allowed = B_FALSE;
}
@@ -4502,8 +4576,8 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
*/
static void
-metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
- int allocator)
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
+ int flags, int allocator)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
(flags & METASLAB_DONT_THROTTLE))
@@ -4536,8 +4610,8 @@ metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
}
void
-metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
- int allocator, boolean_t io_complete)
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
+ int flags, int allocator, boolean_t io_complete)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
(flags & METASLAB_DONT_THROTTLE))
@@ -4554,7 +4628,7 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
}
void
-metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
int allocator)
{
#ifdef ZFS_DEBUG
@@ -4580,6 +4654,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
ASSERT(MUTEX_HELD(&msp->ms_lock));
VERIFY(!msp->ms_condensing);
VERIFY0(msp->ms_disabled);
+ VERIFY0(msp->ms_new);
start = mc->mc_ops->msop_alloc(msp, size);
if (start != -1ULL) {
@@ -4634,7 +4709,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
if (msp == NULL)
msp = avl_nearest(t, idx, AVL_AFTER);
- int tries = 0;
+ uint_t tries = 0;
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
int i;
@@ -4651,10 +4726,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
}
/*
- * If the selected metaslab is condensing or disabled,
- * skip it.
+ * If the selected metaslab is condensing or disabled, or
+ * hasn't gone through a metaslab_sync_done(), then skip it.
*/
- if (msp->ms_condensing || msp->ms_disabled > 0)
+ if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
continue;
*was_active = msp->ms_allocator != -1;
@@ -4721,7 +4796,6 @@ metaslab_active_mask_verify(metaslab_t *msp)
}
}
-/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
@@ -5034,7 +5108,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
int allocator, boolean_t try_hard)
{
uint64_t offset;
- ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
dva, d, allocator, try_hard);
@@ -5073,7 +5146,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
zio_alloc_list_t *zal, int allocator)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
- metaslab_group_t *mg, *fast_mg, *rotor;
+ metaslab_group_t *mg, *rotor;
vdev_t *vd;
boolean_t try_hard = B_FALSE;
@@ -5086,7 +5159,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* damage can result in extremely long reconstruction times. This
* will also test spilling from special to normal.
*/
- if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) {
+ if (psize >= metaslab_force_ganging &&
+ metaslab_force_ganging_pct > 0 &&
+ (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
allocator);
return (SET_ERROR(ENOSPC));
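The hard-coded 3% is now the metaslab_force_ganging_pct tunable; a rough arithmetic sketch with the defaults declared near the top of this file:

/*
 * metaslab_force_ganging     = SPA_MAXBLOCKSIZE + 1  (never matches by default)
 * metaslab_force_ganging_pct = 3
 * If metaslab_force_ganging were lowered for testing (say to 128K),
 * random_in_range(100) < MIN(3, 100) forces roughly 3 of every 100
 * qualifying allocations down the gang path; setting the pct to 0
 * disables forced ganging entirely.
 */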
@@ -5126,8 +5201,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
if (vd != NULL && vd->vdev_mg != NULL) {
mg = vdev_get_mg(vd, mc);
- if (flags & METASLAB_HINTBP_AVOID &&
- mg->mg_next != NULL)
+ if (flags & METASLAB_HINTBP_AVOID)
mg = mg->mg_next;
} else {
mg = mca->mca_rotor;
@@ -5135,15 +5209,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
- } else if (flags & METASLAB_FASTWRITE) {
- mg = fast_mg = mca->mca_rotor;
-
- do {
- if (fast_mg->mg_vd->vdev_pending_fastwrite <
- mg->mg_vd->vdev_pending_fastwrite)
- mg = fast_mg;
- } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
-
} else {
ASSERT(mca->mca_rotor != NULL);
mg = mca->mca_rotor;
@@ -5184,7 +5249,7 @@ top:
*/
if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
allocatable = metaslab_group_allocatable(mg, rotor,
- psize, allocator, d);
+ flags, psize, allocator, d);
}
if (!allocatable) {
@@ -5193,15 +5258,12 @@ top:
goto next;
}
- ASSERT(mg->mg_initialized);
-
/*
- * Avoid writing single-copy data to a failing,
+ * Avoid writing single-copy data to an unhealthy,
* non-redundant vdev, unless we've already tried all
* other vdevs.
*/
- if ((vd->vdev_stat.vs_write_errors > 0 ||
- vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ if (vd->vdev_state < VDEV_STATE_HEALTHY &&
d == 0 && !try_hard && vd->vdev_children == 0) {
metaslab_trace_add(zal, mg, NULL, psize, d,
TRACE_VDEV_ERROR, allocator);
@@ -5210,7 +5272,7 @@ top:
ASSERT(mg->mg_class == mc);
- uint64_t asize = vdev_psize_to_asize(vd, psize);
+ uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
/*
@@ -5269,7 +5331,7 @@ top:
mg->mg_bias = 0;
}
- if ((flags & METASLAB_FASTWRITE) ||
+ if ((flags & METASLAB_ZIL) ||
atomic_add_64_nv(&mca->mca_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mca->mca_rotor = mg->mg_next;
@@ -5282,11 +5344,6 @@ top:
((flags & METASLAB_GANG_HEADER) ? 1 : 0));
DVA_SET_ASIZE(&dva[d], asize);
- if (flags & METASLAB_FASTWRITE) {
- atomic_add_64(&vd->vdev_pending_fastwrite,
- psize);
- }
-
return (0);
}
next:
@@ -5305,7 +5362,7 @@ next:
goto top;
}
- bzero(&dva[d], sizeof (dva_t));
+ memset(&dva[d], 0, sizeof (dva_t));
metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
return (SET_ERROR(ENOSPC));
@@ -5347,11 +5404,11 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
mutex_exit(&msp->ms_lock);
}
-/* ARGSUSED */
void
metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
+ (void) inner_offset;
boolean_t *checkpoint = arg;
ASSERT3P(checkpoint, !=, NULL);
@@ -5441,8 +5498,9 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
DVA_GET_VDEV(&bp->blk_dva[0]));
vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
- bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
+ uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
+ BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
DVA_SET_OFFSET(&bp->blk_dva[0], offset);
@@ -5629,8 +5687,7 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
- for (int d = 0; d < slots; d++)
- zfs_refcount_add(&mca->mca_alloc_slots, zio);
+ zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
return (B_TRUE);
}
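zfs_refcount_add_few()/zfs_refcount_remove_few() fold the per-slot loops into a single call; an equivalence sketch:

zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
/* behaves like: for (int d = 0; d < slots; d++)
 *                   zfs_refcount_add(&mca->mca_alloc_slots, zio);       */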
@@ -5644,8 +5701,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
- for (int d = 0; d < slots; d++)
- zfs_refcount_remove(&mca->mca_alloc_slots, zio);
+ zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
}
static int
@@ -5717,11 +5773,11 @@ typedef struct metaslab_claim_cb_arg_t {
int mcca_error;
} metaslab_claim_cb_arg_t;
-/* ARGSUSED */
static void
metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
+ (void) inner_offset;
metaslab_claim_cb_arg_t *mcca_arg = arg;
if (mcca_arg->mcca_error == 0) {
@@ -5793,8 +5849,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
- ASSERT(bp->blk_birth == 0);
- ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
+ ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
+ ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -5818,7 +5874,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
metaslab_group_alloc_decrement(spa,
DVA_GET_VDEV(&dva[d]), zio, flags,
allocator, B_FALSE);
- bzero(&dva[d], sizeof (dva_t));
+ memset(&dva[d], 0, sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
@@ -5848,7 +5904,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+ ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
/*
* If we have a checkpoint for the pool we need to make sure that
@@ -5866,7 +5922,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
* normally as they will be referenced by the checkpointed uberblock.
*/
boolean_t checkpoint = B_FALSE;
- if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+ if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
/*
* At this point, if the block is part of the checkpoint
@@ -5924,60 +5980,12 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
return (error);
}
-void
-metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
- uint64_t psize = BP_GET_PSIZE(bp);
- int d;
- vdev_t *vd;
-
- ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT(psize > 0);
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
- for (d = 0; d < ndvas; d++) {
- if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
- continue;
- atomic_add_64(&vd->vdev_pending_fastwrite, psize);
- }
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
-}
-
-void
-metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
-{
- const dva_t *dva = bp->blk_dva;
- int ndvas = BP_GET_NDVAS(bp);
- uint64_t psize = BP_GET_PSIZE(bp);
- int d;
- vdev_t *vd;
-
- ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT(psize > 0);
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-
- for (d = 0; d < ndvas; d++) {
- if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
- continue;
- ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
- atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
- }
-
- spa_config_exit(spa, SCL_VDEV, FTAG);
-}
-
-/* ARGSUSED */
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
+ (void) inner, (void) arg;
+
if (vd->vdev_ops == &vdev_indirect_ops)
return;
@@ -6139,6 +6147,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
mutex_exit(&mg->mg_ms_disabled_lock);
}
+void
+metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
+{
+ ms->ms_unflushed_dirty = dirty;
+}
+
static void
metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
{
@@ -6175,22 +6189,23 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
void
metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
{
- spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
-
- if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
- return;
-
ms->ms_unflushed_txg = txg;
metaslab_update_ondisk_flush_data(ms, tx);
}
+boolean_t
+metaslab_unflushed_dirty(metaslab_t *ms)
+{
+ return (ms->ms_unflushed_dirty);
+}
+
uint64_t
metaslab_unflushed_txg(metaslab_t *ms)
{
return (ms->ms_unflushed_txg);
}
-ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
"Allocation granularity (a.k.a. stripe size)");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
@@ -6202,29 +6217,33 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
"Preload potential metaslabs during reassessment");
-ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
+ "Max number of metaslabs per group to preload");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
"Delay in txgs after metaslab was last used before unloading");
-ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
"Delay in milliseconds after metaslab was last used before unloading");
/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
"Percentage of metaslab group size that should be free to make it "
"eligible for allocation");
-ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
"Percentage of metaslab group size that should be considered eligible "
"for allocations unless all metaslab groups within the metaslab class "
"have also crossed this threshold");
-ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT,
- ZMOD_RW, "Fragmentation for metaslab to allow allocation");
-
-ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
+ ZMOD_RW,
"Use the fragmentation metric to prefer less fragmented metaslabs");
/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
+ ZMOD_RW, "Fragmentation for metaslab to allow allocation");
+
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
"Prefer metaslabs with lower LBAs");
@@ -6237,23 +6256,32 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
"Segment-based metaslab selection maximum buckets before switching");
-ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW,
- "Blocks larger than this size are forced to be gang blocks");
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
+ "Blocks larger than this size are sometimes forced to be gang blocks");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
+ "Percentage of large blocks that will be forced to be gang blocks");
-ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
"Max distance (bytes) to search forward before using size tree");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
"When looking in size tree, use largest segment instead of exact fit");
-ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG,
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
-ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
"Percentage of memory that can be used to store metaslab range trees");
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
ZMOD_RW, "Try hard to allocate before ganging");
-ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
"Normally only consider this many of the best metaslabs in each vdev");
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
+ param_set_active_allocator, param_get_charp, ZMOD_RW,
+ "SPA active allocator");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index f67a4eb22a2d..71122542758d 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -156,7 +156,7 @@
* vary with the I/O load and this observed value is the ub_mmp_delay which is
* stored in the uberblock. The minimum allowed value is 100 ms.
*/
-ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
+uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
/*
* Used to control the duration of the activity test on import. Smaller values
@@ -186,8 +186,8 @@ uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
*/
uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
-char *mmp_tag = "mmp_write_uberblock";
-static void mmp_thread(void *arg);
+static const void *const mmp_tag = "mmp_write_uberblock";
+static __attribute__((noreturn)) void mmp_thread(void *arg);
void
mmp_init(spa_t *spa)
@@ -224,7 +224,6 @@ mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
*mpp = NULL;
cv_broadcast(&mmp->mmp_thread_cv);
CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */
- thread_exit();
}
void
@@ -304,8 +303,10 @@ mmp_next_leaf(spa_t *spa)
do {
leaf = list_next(&spa->spa_leaf_list, leaf);
- if (leaf == NULL)
+ if (leaf == NULL) {
leaf = list_head(&spa->spa_leaf_list);
+ ASSERT3P(leaf, !=, NULL);
+ }
/*
* We skip unwritable, offline, detached, and dRAID spare
@@ -444,7 +445,7 @@ mmp_write_uberblock(spa_t *spa)
uint64_t offset;
hrtime_t lock_acquire_time = gethrtime();
- spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
+ spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
lock_acquire_time = gethrtime() - lock_acquire_time;
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
@@ -516,8 +517,9 @@ mmp_write_uberblock(spa_t *spa)
zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
- abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+ abd_zero_off(ub_abd, sizeof (uberblock_t),
+ VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
mmp->mmp_seq++;
mmp->mmp_kstat_id++;
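The uberblock buffer is now copied first and only the remainder zeroed, instead of zeroing the whole ABD and then overwriting its head; a layout sketch:

/*
 * [0, sizeof (uberblock_t))                        abd_copy_from_buf()
 * [sizeof (uberblock_t), VDEV_UBERBLOCK_SIZE(vd))  abd_zero_off()
 * so every byte of ub_abd is written exactly once.
 */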
@@ -537,7 +539,7 @@ mmp_write_uberblock(spa_t *spa)
zio_nowait(zio);
}
-static void
+static __attribute__((noreturn)) void
mmp_thread(void *arg)
{
spa_t *spa = (spa_t *)arg;
@@ -549,11 +551,11 @@ mmp_thread(void *arg)
uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
zfs_multihost_fail_intervals);
hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
- boolean_t last_spa_suspended = suspended;
- boolean_t last_spa_multihost = multihost;
- uint64_t last_mmp_interval = mmp_interval;
- uint32_t last_mmp_fail_intervals = mmp_fail_intervals;
- hrtime_t last_mmp_fail_ns = mmp_fail_ns;
+ boolean_t last_spa_suspended;
+ boolean_t last_spa_multihost;
+ uint64_t last_mmp_interval;
+ uint32_t last_mmp_fail_intervals;
+ hrtime_t last_mmp_fail_ns;
callb_cpr_t cpr;
int skip_wait = 0;
@@ -662,12 +664,13 @@ mmp_thread(void *arg)
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
- "mmp_fail_intervals %llu mmp_fail_ns %llu",
+ "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,
(u_longlong_t)mmp_fail_intervals,
- (u_longlong_t)mmp_fail_ns);
+ (u_longlong_t)mmp_fail_ns,
+ (u_longlong_t)spa->spa_uberblock.ub_txg);
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llu ms; suspending pool. "
"Hrtime %llu",
@@ -698,6 +701,8 @@ mmp_thread(void *arg)
mmp->mmp_zio_root = NULL;
mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
+
+ thread_exit();
}
/*
@@ -733,7 +738,7 @@ mmp_signal_all_threads(void)
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval,
- param_set_multihost_interval, param_get_ulong, ZMOD_RW,
+ param_set_multihost_interval, spl_param_get_u64, ZMOD_RW,
"Milliseconds between mmp writes to each leaf");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c
index 8bbc9b376ae0..3d3ef86e6839 100644
--- a/sys/contrib/openzfs/module/zfs/multilist.c
+++ b/sys/contrib/openzfs/module/zfs/multilist.c
@@ -24,7 +24,7 @@
* This overrides the number of sublists in each multilist_t, which defaults
* to the number of CPUs in the system (see multilist_create()).
*/
-int zfs_multilist_num_sublists = 0;
+uint_t zfs_multilist_num_sublists = 0;
/*
* Given the object contained on the list, return a pointer to the
@@ -36,6 +36,8 @@ multilist_d2l(multilist_t *ml, void *obj)
{
return ((multilist_node_t *)((char *)obj + ml->ml_offset));
}
+#else
+#define multilist_d2l(ml, obj) ((void) sizeof (ml), (void) sizeof (obj), NULL)
#endif
/*
@@ -67,7 +69,7 @@ multilist_d2l(multilist_t *ml, void *obj)
*/
static void
multilist_create_impl(multilist_t *ml, size_t size, size_t offset,
- unsigned int num, multilist_sublist_index_func_t *index_func)
+ uint_t num, multilist_sublist_index_func_t *index_func)
{
ASSERT3U(size, >, 0);
ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
@@ -102,7 +104,7 @@ void
multilist_create(multilist_t *ml, size_t size, size_t offset,
multilist_sublist_index_func_t *index_func)
{
- int num_sublists;
+ uint_t num_sublists;
if (zfs_multilist_num_sublists > 0) {
num_sublists = zfs_multilist_num_sublists;
@@ -275,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
return (random_in_range(ml->ml_num_sublists));
}
+void
+multilist_sublist_lock(multilist_sublist_t *mls)
+{
+ mutex_enter(&mls->mls_lock);
+}
+
/* Lock and return the sublist specified at the given index */
multilist_sublist_t *
-multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
{
multilist_sublist_t *mls;
@@ -292,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
multilist_sublist_t *
multilist_sublist_lock_obj(multilist_t *ml, void *obj)
{
- return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+ return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
}
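After this rename there are three ways to take a sublist lock; a short usage sketch, with ml, idx, and obj assumed to come from the caller:

multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, idx); /* by index */
multilist_sublist_unlock(mls);
mls = multilist_sublist_lock_obj(ml, obj);      /* by the object's index func */
multilist_sublist_unlock(mls);
multilist_sublist_lock(mls);                    /* by sublist pointer (new) */
multilist_sublist_unlock(mls);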
void
@@ -325,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
list_insert_tail(&mls->mls_list, obj);
}
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_after(&mls->mls_list, prev, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_before(&mls->mls_list, next, obj);
+}
+
/*
* Move the object one element forward in the list.
*
@@ -423,7 +447,5 @@ multilist_link_active(multilist_node_t *link)
return (list_link_active(link));
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, UINT, ZMOD_RW,
"Number of sublists used in each multilist");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/pathname.c b/sys/contrib/openzfs/module/zfs/pathname.c
index 84ab7b7e1111..51460d119106 100644
--- a/sys/contrib/openzfs/module/zfs/pathname.c
+++ b/sys/contrib/openzfs/module/zfs/pathname.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
index 595918e5a742..5174e2c46633 100644
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -78,7 +78,7 @@
static inline void
rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt)
{
- ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ ASSERT3U(rt->rt_type, <, RANGE_SEG_NUM_TYPES);
size_t size = 0;
switch (rt->rt_type) {
case RANGE_SEG32:
@@ -91,9 +91,9 @@ rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt)
size = sizeof (range_seg_gap_t);
break;
default:
- VERIFY(0);
+ __builtin_unreachable();
}
- bcopy(src, dest, size);
+ memcpy(dest, src, size);
}
void
@@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
rt->rt_histogram[idx]--;
}
+__attribute__((always_inline)) inline
static int
range_tree_seg32_compare(const void *x1, const void *x2)
{
@@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+__attribute__((always_inline)) inline
static int
range_tree_seg64_compare(const void *x1, const void *x2)
{
@@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+__attribute__((always_inline)) inline
static int
range_tree_seg_gap_compare(const void *x1, const void *x2)
{
@@ -187,11 +190,18 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
+ range_tree_seg32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
+ range_tree_seg64_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
+ range_tree_seg_gap_compare)
+
range_tree_t *
-range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
- uint64_t start, uint64_t shift,
- int (*zfs_btree_compare) (const void *, const void *),
- uint64_t gap)
+range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
+ void *arg, uint64_t start, uint64_t shift, uint64_t gap)
{
range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
@@ -199,23 +209,27 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
size_t size;
int (*compare) (const void *, const void *);
+ bt_find_in_buf_f bt_find;
switch (type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = range_tree_seg32_compare;
+ bt_find = range_tree_seg32_find_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = range_tree_seg64_compare;
+ bt_find = range_tree_seg64_find_in_buf;
break;
case RANGE_SEG_GAP:
size = sizeof (range_seg_gap_t);
compare = range_tree_seg_gap_compare;
+ bt_find = range_tree_seg_gap_find_in_buf;
break;
default:
panic("Invalid range seg type %d", type);
}
- zfs_btree_create(&rt->rt_root, compare, size);
+ zfs_btree_create(&rt->rt_root, compare, bt_find, size);
rt->rt_ops = ops;
rt->rt_gap = gap;
@@ -223,7 +237,6 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
rt->rt_type = type;
rt->rt_start = start;
rt->rt_shift = shift;
- rt->rt_btree_compare = zfs_btree_compare;
if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
rt->rt_ops->rtop_create(rt, rt->rt_arg);
@@ -232,10 +245,10 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
}
range_tree_t *
-range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type,
void *arg, uint64_t start, uint64_t shift)
{
- return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0));
+ return (range_tree_create_gap(ops, type, arg, start, shift, 0));
}
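range_tree_create_impl() and its caller-supplied zfs_btree_compare are gone; gap trees now go through range_tree_create_gap(), and each segment type gets a ZFS_BTREE_FIND_IN_BUF_FUNC()-generated search routine handed to zfs_btree_create(). A minimal caller sketch, with ops, arg, start, and shift assumed from the surrounding code:

range_tree_t *rt  = range_tree_create(ops, RANGE_SEG64, arg, start, shift);
range_tree_t *grt = range_tree_create_gap(ops, RANGE_SEG_GAP, arg, start, shift,
    16 << 10 /* hypothetical gap */);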
void
@@ -372,6 +385,7 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
* invalid as soon as we do any mutating btree operations.
*/
rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
+ ASSERT3P(rs_after, !=, NULL);
rs_set_start_raw(rs_after, rt, before_start);
rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
rs = rs_after;
@@ -701,7 +715,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
zfs_btree_clear(&rt->rt_root);
}
- bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
+ memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
rt->rt_space = 0;
}
@@ -741,76 +755,6 @@ range_tree_is_empty(range_tree_t *rt)
return (range_tree_space(rt) == 0);
}
-/* ARGSUSED */
-void
-rt_btree_create(range_tree_t *rt, void *arg)
-{
- zfs_btree_t *size_tree = arg;
-
- size_t size;
- switch (rt->rt_type) {
- case RANGE_SEG32:
- size = sizeof (range_seg32_t);
- break;
- case RANGE_SEG64:
- size = sizeof (range_seg64_t);
- break;
- case RANGE_SEG_GAP:
- size = sizeof (range_seg_gap_t);
- break;
- default:
- panic("Invalid range seg type %d", rt->rt_type);
- }
- zfs_btree_create(size_tree, rt->rt_btree_compare, size);
-}
-
-/* ARGSUSED */
-void
-rt_btree_destroy(range_tree_t *rt, void *arg)
-{
- zfs_btree_t *size_tree = arg;
- ASSERT0(zfs_btree_numnodes(size_tree));
-
- zfs_btree_destroy(size_tree);
-}
-
-/* ARGSUSED */
-void
-rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
- zfs_btree_t *size_tree = arg;
-
- zfs_btree_add(size_tree, rs);
-}
-
-/* ARGSUSED */
-void
-rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
- zfs_btree_t *size_tree = arg;
-
- zfs_btree_remove(size_tree, rs);
-}
-
-/* ARGSUSED */
-void
-rt_btree_vacate(range_tree_t *rt, void *arg)
-{
- zfs_btree_t *size_tree = arg;
- zfs_btree_clear(size_tree);
- zfs_btree_destroy(size_tree);
-
- rt_btree_create(rt, arg);
-}
-
-range_tree_ops_t rt_btree_ops = {
- .rtop_create = rt_btree_create,
- .rtop_destroy = rt_btree_destroy,
- .rtop_add = rt_btree_add,
- .rtop_remove = rt_btree_remove,
- .rtop_vacate = rt_btree_vacate
-};
-
/*
* Remove any overlapping ranges between the given segment [start, end)
* from removefrom. Add non-overlapping leftovers to addto.
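A minimal sketch of a caller for the reworked constructor above (assumptions: kernel context with <sys/range_tree.h>; the gap threshold and offsets are illustrative, and range_tree_add()/range_tree_vacate()/range_tree_destroy() are the existing helpers):

/* Hedged sketch: a 64-bit segment tree that coalesces ranges separated
 * by less than 512 bytes, per the range_tree_create_gap() signature above. */
static void
range_tree_gap_example(void)
{
	range_tree_t *rt;

	rt = range_tree_create_gap(NULL, RANGE_SEG64, NULL, 0, 0, 512);
	range_tree_add(rt, 0x100000, 0x2000);	/* [start, start + size) */
	range_tree_vacate(rt, NULL, NULL);	/* drop all segments */
	range_tree_destroy(rt);
}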
diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c
index 35a379dded69..718bbb34a8d5 100644
--- a/sys/contrib/openzfs/module/zfs/refcount.c
+++ b/sys/contrib/openzfs/module/zfs/refcount.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -26,43 +26,50 @@
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
+#ifdef ZFS_DEBUG
/*
 * Reference count tracking is disabled by default. Its memory requirements
 * are reasonable; however, as implemented it consumes a significant amount of
* cpu time. Until its performance is improved it should be manually enabled.
*/
-int reference_tracking_enable = FALSE;
-int reference_history = 3; /* tunable */
+int reference_tracking_enable = B_FALSE;
+static uint_t reference_history = 3; /* tunable */
-#ifdef ZFS_DEBUG
static kmem_cache_t *reference_cache;
-static kmem_cache_t *reference_history_cache;
void
zfs_refcount_init(void)
{
reference_cache = kmem_cache_create("reference_cache",
sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
- reference_history_cache = kmem_cache_create("reference_history_cache",
- sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}
void
zfs_refcount_fini(void)
{
kmem_cache_destroy(reference_cache);
- kmem_cache_destroy(reference_history_cache);
+}
+
+static int
+zfs_refcount_compare(const void *x1, const void *x2)
+{
+ const reference_t *r1 = (const reference_t *)x1;
+ const reference_t *r2 = (const reference_t *)x2;
+
+ int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder);
+ int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number);
+ int cmp = cmp1 ? cmp1 : cmp2;
+ return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2));
}
void
zfs_refcount_create(zfs_refcount_t *rc)
{
mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
- list_create(&rc->rc_list, sizeof (reference_t),
- offsetof(reference_t, ref_link));
+ avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t),
+ offsetof(reference_t, ref_link.a));
list_create(&rc->rc_removed, sizeof (reference_t),
- offsetof(reference_t, ref_link));
+ offsetof(reference_t, ref_link.l));
rc->rc_count = 0;
rc->rc_removed_count = 0;
rc->rc_tracked = reference_tracking_enable;
@@ -86,19 +93,15 @@ void
zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
{
reference_t *ref;
+ void *cookie = NULL;
ASSERT3U(rc->rc_count, ==, number);
- while ((ref = list_head(&rc->rc_list))) {
- list_remove(&rc->rc_list, ref);
+ while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL)
kmem_cache_free(reference_cache, ref);
- }
- list_destroy(&rc->rc_list);
+ avl_destroy(&rc->rc_tree);
- while ((ref = list_head(&rc->rc_removed))) {
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache, ref->ref_removed);
+ while ((ref = list_remove_head(&rc->rc_removed)))
kmem_cache_free(reference_cache, ref);
- }
list_destroy(&rc->rc_removed);
mutex_destroy(&rc->rc_mtx);
}
@@ -124,10 +127,10 @@ zfs_refcount_count(zfs_refcount_t *rc)
int64_t
zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
{
- reference_t *ref = NULL;
+ reference_t *ref;
int64_t count;
- if (!rc->rc_tracked) {
+ if (likely(!rc->rc_tracked)) {
count = atomic_add_64_nv(&(rc)->rc_count, number);
ASSERT3U(count, >=, number);
return (count);
@@ -136,9 +139,9 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
ref->ref_holder = holder;
ref->ref_number = number;
+ ref->ref_search = B_FALSE;
mutex_enter(&rc->rc_mtx);
- ASSERT3U(rc->rc_count, >=, 0);
- list_insert_head(&rc->rc_list, ref);
+ avl_add(&rc->rc_tree, ref);
rc->rc_count += number;
count = rc->rc_count;
mutex_exit(&rc->rc_mtx);
@@ -152,51 +155,55 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
return (zfs_refcount_add_many(rc, 1, holder));
}
+void
+zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
+{
+ if (likely(!rc->rc_tracked))
+ (void) zfs_refcount_add_many(rc, number, holder);
+ else for (; number > 0; number--)
+ (void) zfs_refcount_add(rc, holder);
+}
+
int64_t
zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
const void *holder)
{
- reference_t *ref;
+ reference_t *ref, s;
int64_t count;
- if (!rc->rc_tracked) {
+ if (likely(!rc->rc_tracked)) {
count = atomic_add_64_nv(&(rc)->rc_count, -number);
ASSERT3S(count, >=, 0);
return (count);
}
+ s.ref_holder = holder;
+ s.ref_number = number;
+ s.ref_search = B_TRUE;
mutex_enter(&rc->rc_mtx);
ASSERT3U(rc->rc_count, >=, number);
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder && ref->ref_number == number) {
- list_remove(&rc->rc_list, ref);
- if (reference_history > 0) {
- ref->ref_removed =
- kmem_cache_alloc(reference_history_cache,
- KM_SLEEP);
- list_insert_head(&rc->rc_removed, ref);
- rc->rc_removed_count++;
- if (rc->rc_removed_count > reference_history) {
- ref = list_tail(&rc->rc_removed);
- list_remove(&rc->rc_removed, ref);
- kmem_cache_free(reference_history_cache,
- ref->ref_removed);
- kmem_cache_free(reference_cache, ref);
- rc->rc_removed_count--;
- }
- } else {
- kmem_cache_free(reference_cache, ref);
- }
- rc->rc_count -= number;
- count = rc->rc_count;
- mutex_exit(&rc->rc_mtx);
- return (count);
+ ref = avl_find(&rc->rc_tree, &s, NULL);
+ if (unlikely(ref == NULL)) {
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+ }
+ avl_remove(&rc->rc_tree, ref);
+ if (reference_history > 0) {
+ list_insert_head(&rc->rc_removed, ref);
+ if (rc->rc_removed_count >= reference_history) {
+ ref = list_remove_tail(&rc->rc_removed);
+ kmem_cache_free(reference_cache, ref);
+ } else {
+ rc->rc_removed_count++;
}
+ } else {
+ kmem_cache_free(reference_cache, ref);
}
- panic("No such hold %p on refcount %llx", holder,
- (u_longlong_t)(uintptr_t)rc);
- return (-1);
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
}
int64_t
@@ -206,33 +213,49 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
}
void
+zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
+{
+ if (likely(!rc->rc_tracked))
+ (void) zfs_refcount_remove_many(rc, number, holder);
+ else for (; number > 0; number--)
+ (void) zfs_refcount_remove(rc, holder);
+}
+
+void
zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
{
- int64_t count, removed_count;
- list_t list, removed;
+ avl_tree_t tree;
+ list_t removed;
+ reference_t *ref;
+ void *cookie = NULL;
+ uint64_t count;
+ uint_t removed_count;
- list_create(&list, sizeof (reference_t),
- offsetof(reference_t, ref_link));
+ avl_create(&tree, zfs_refcount_compare, sizeof (reference_t),
+ offsetof(reference_t, ref_link.a));
list_create(&removed, sizeof (reference_t),
- offsetof(reference_t, ref_link));
+ offsetof(reference_t, ref_link.l));
mutex_enter(&src->rc_mtx);
count = src->rc_count;
removed_count = src->rc_removed_count;
src->rc_count = 0;
src->rc_removed_count = 0;
- list_move_tail(&list, &src->rc_list);
+ avl_swap(&tree, &src->rc_tree);
list_move_tail(&removed, &src->rc_removed);
mutex_exit(&src->rc_mtx);
mutex_enter(&dst->rc_mtx);
dst->rc_count += count;
dst->rc_removed_count += removed_count;
- list_move_tail(&dst->rc_list, &list);
+ if (avl_is_empty(&dst->rc_tree))
+ avl_swap(&dst->rc_tree, &tree);
+ else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL)
+ avl_add(&dst->rc_tree, ref);
list_move_tail(&dst->rc_removed, &removed);
mutex_exit(&dst->rc_mtx);
- list_destroy(&list);
+ avl_destroy(&tree);
list_destroy(&removed);
}
@@ -240,23 +263,19 @@ void
zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
const void *current_holder, const void *new_holder)
{
- reference_t *ref;
- boolean_t found = B_FALSE;
+ reference_t *ref, s;
- if (!rc->rc_tracked)
+ if (likely(!rc->rc_tracked))
return;
+ s.ref_holder = current_holder;
+ s.ref_number = number;
+ s.ref_search = B_TRUE;
mutex_enter(&rc->rc_mtx);
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == current_holder &&
- ref->ref_number == number) {
- ref->ref_holder = new_holder;
- found = B_TRUE;
- break;
- }
- }
- ASSERT(found);
+ ref = avl_find(&rc->rc_tree, &s, NULL);
+ ASSERT(ref);
+ ref->ref_holder = new_holder;
+ avl_update(&rc->rc_tree, ref);
mutex_exit(&rc->rc_mtx);
}
@@ -276,21 +295,23 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
boolean_t
zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
{
- reference_t *ref;
+ reference_t *ref, s;
+ avl_index_t idx;
+ boolean_t res;
- if (!rc->rc_tracked)
+ if (likely(!rc->rc_tracked))
return (zfs_refcount_count(rc) > 0);
+ s.ref_holder = holder;
+ s.ref_number = 0;
+ s.ref_search = B_TRUE;
mutex_enter(&rc->rc_mtx);
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder) {
- mutex_exit(&rc->rc_mtx);
- return (B_TRUE);
- }
- }
+ ref = avl_find(&rc->rc_tree, &s, &idx);
+ if (likely(ref == NULL))
+ ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
+ res = ref && ref->ref_holder == holder;
mutex_exit(&rc->rc_mtx);
- return (B_FALSE);
+ return (res);
}
/*
@@ -301,21 +322,23 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
boolean_t
zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
{
- reference_t *ref;
+ reference_t *ref, s;
+ avl_index_t idx;
+ boolean_t res;
- if (!rc->rc_tracked)
+ if (likely(!rc->rc_tracked))
return (B_TRUE);
mutex_enter(&rc->rc_mtx);
- for (ref = list_head(&rc->rc_list); ref;
- ref = list_next(&rc->rc_list, ref)) {
- if (ref->ref_holder == holder) {
- mutex_exit(&rc->rc_mtx);
- return (B_FALSE);
- }
- }
+ s.ref_holder = holder;
+ s.ref_number = 0;
+ s.ref_search = B_TRUE;
+ ref = avl_find(&rc->rc_tree, &s, &idx);
+ if (likely(ref == NULL))
+ ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
+ res = ref == NULL || ref->ref_holder != holder;
mutex_exit(&rc->rc_mtx);
- return (B_TRUE);
+ return (res);
}
EXPORT_SYMBOL(zfs_refcount_create);
@@ -327,10 +350,10 @@ EXPORT_SYMBOL(zfs_refcount_remove);
EXPORT_SYMBOL(zfs_refcount_held);
/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, ,reference_tracking_enable, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW,
"Track reference holders to refcount_t objects");
-ZFS_MODULE_PARAM(zfs, ,reference_history, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW,
"Maximum reference holders being tracked");
/* END CSTYLED */
#endif /* ZFS_DEBUG */
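A standalone sketch of the comparator scheme used above, with hypothetical stand-in types: entries sort by (holder, number); identical live entries stay distinct via their address, while a stack-allocated search key sets ref_search so avl_find() can match any entry with the same holder and number:

/* Illustrative only; the real reference_t and TREE_CMP/TREE_PCMP differ. */
#include <stdint.h>

struct ref {
	const void	*ref_holder;
	uint64_t	ref_number;
	int		ref_search;	/* set on stack keys used for lookup */
};

#define CMP(a, b)	(((a) > (b)) - ((a) < (b)))

static int
ref_compare(const struct ref *r1, const struct ref *r2)
{
	int cmp = CMP((uintptr_t)r1->ref_holder, (uintptr_t)r2->ref_holder);
	if (cmp == 0)
		cmp = CMP(r1->ref_number, r2->ref_number);
	if (cmp == 0 && !r1->ref_search)	/* live entries never tie */
		cmp = CMP((uintptr_t)r1, (uintptr_t)r2);
	return (cmp);
}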
diff --git a/sys/contrib/openzfs/module/zfs/rrwlock.c b/sys/contrib/openzfs/module/zfs/rrwlock.c
index d23fc3ad1067..a8c438bb6ebd 100644
--- a/sys/contrib/openzfs/module/zfs/rrwlock.c
+++ b/sys/contrib/openzfs/module/zfs/rrwlock.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -77,7 +77,7 @@ uint_t rrw_tsd_key;
typedef struct rrw_node {
struct rrw_node *rn_next;
rrwlock_t *rn_rrl;
- void *rn_tag;
+ const void *rn_tag;
} rrw_node_t;
static rrw_node_t *
@@ -99,7 +99,7 @@ rrn_find(rrwlock_t *rrl)
* Add a node to the head of the singly linked list.
*/
static void
-rrn_add(rrwlock_t *rrl, void *tag)
+rrn_add(rrwlock_t *rrl, const void *tag)
{
rrw_node_t *rn;
@@ -115,7 +115,7 @@ rrn_add(rrwlock_t *rrl, void *tag)
* thread's list and return TRUE; otherwise return FALSE.
*/
static boolean_t
-rrn_find_and_remove(rrwlock_t *rrl, void *tag)
+rrn_find_and_remove(rrwlock_t *rrl, const void *tag)
{
rrw_node_t *rn;
rrw_node_t *prev = NULL;
@@ -160,7 +160,7 @@ rrw_destroy(rrwlock_t *rrl)
}
static void
-rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, const void *tag)
{
mutex_enter(&rrl->rr_lock);
#if !defined(ZFS_DEBUG) && defined(_KERNEL)
@@ -192,7 +192,7 @@ rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
}
void
-rrw_enter_read(rrwlock_t *rrl, void *tag)
+rrw_enter_read(rrwlock_t *rrl, const void *tag)
{
rrw_enter_read_impl(rrl, B_FALSE, tag);
}
@@ -204,7 +204,7 @@ rrw_enter_read(rrwlock_t *rrl, void *tag)
* the pending writer does not work, so we have to give an explicit hint here.
*/
void
-rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+rrw_enter_read_prio(rrwlock_t *rrl, const void *tag)
{
rrw_enter_read_impl(rrl, B_TRUE, tag);
}
@@ -228,7 +228,7 @@ rrw_enter_write(rrwlock_t *rrl)
}
void
-rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+rrw_enter(rrwlock_t *rrl, krw_t rw, const void *tag)
{
if (rw == RW_READER)
rrw_enter_read(rrl, tag);
@@ -237,7 +237,7 @@ rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
}
void
-rrw_exit(rrwlock_t *rrl, void *tag)
+rrw_exit(rrwlock_t *rrl, const void *tag)
{
mutex_enter(&rrl->rr_lock);
#if !defined(ZFS_DEBUG) && defined(_KERNEL)
@@ -339,7 +339,7 @@ rrm_destroy(rrmlock_t *rrl)
}
void
-rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+rrm_enter(rrmlock_t *rrl, krw_t rw, const void *tag)
{
if (rw == RW_READER)
rrm_enter_read(rrl, tag);
@@ -358,7 +358,7 @@ rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
void
-rrm_enter_read(rrmlock_t *rrl, void *tag)
+rrm_enter_read(rrmlock_t *rrl, const void *tag)
{
rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
}
@@ -373,7 +373,7 @@ rrm_enter_write(rrmlock_t *rrl)
}
void
-rrm_exit(rrmlock_t *rrl, void *tag)
+rrm_exit(rrmlock_t *rrl, const void *tag)
{
int i;
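A minimal sketch of the constified tag usage enabled above (kernel context assumed; FTAG is the usual ZFS function-name tag):

/* With const void *tag parameters, read-only tags need no casts. */
static void
rrw_tag_example(rrwlock_t *rrl)
{
	rrw_enter(rrl, RW_READER, FTAG);
	/* ... read-side work under the re-entrant lock ... */
	rrw_exit(rrl, FTAG);
}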
diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c
index 2604a7513ecf..0ae4c331dd36 100644
--- a/sys/contrib/openzfs/module/zfs/sa.c
+++ b/sys/contrib/openzfs/module/zfs/sa.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -23,6 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2023 RackTop Systems, Inc.
*/
#include <sys/zfs_context.h>
@@ -141,7 +142,7 @@ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
uint16_t buflen, dmu_tx_t *tx);
-arc_byteswap_func_t sa_bswap_table[] = {
+static arc_byteswap_func_t sa_bswap_table[] = {
byteswap_uint64_array,
byteswap_uint32_array,
byteswap_uint16_array,
@@ -160,7 +161,7 @@ do { \
*(uint64_t *)((uintptr_t)t + 8) = \
*(uint64_t *)((uintptr_t)s + 8); \
} else { \
- bcopy(s, t, l); \
+ memcpy(t, s, l); \
} \
} else { \
sa_copy_data(f, s, t, l); \
@@ -178,7 +179,7 @@ do { \
* won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
* use this static table.
*/
-sa_attr_reg_t sa_legacy_attrs[] = {
+static const sa_attr_reg_t sa_legacy_attrs[] = {
{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
@@ -200,32 +201,32 @@ sa_attr_reg_t sa_legacy_attrs[] = {
/*
* This is only used for objects of type DMU_OT_ZNODE
*/
-sa_attr_type_t sa_legacy_zpl_layout[] = {
+static const sa_attr_type_t sa_legacy_zpl_layout[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
/*
* Special dummy layout used for buffers with no attributes.
*/
-sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+static const sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
-static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs);
+static const size_t sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs);
static kmem_cache_t *sa_cache = NULL;
-/*ARGSUSED*/
static int
sa_cache_constructor(void *buf, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
sa_handle_t *hdl = buf;
mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
-/*ARGSUSED*/
static void
sa_cache_destructor(void *buf, void *unused)
{
+ (void) unused;
sa_handle_t *hdl = buf;
mutex_destroy(&hdl->sa_lock);
}
@@ -285,12 +286,11 @@ sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
static uint64_t
-sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+sa_layout_info_hash(const sa_attr_type_t *attrs, int attr_count)
{
- int i;
uint64_t crc = -1ULL;
- for (i = 0; i != attr_count; i++)
+ for (int i = 0; i != attr_count; i++)
crc ^= SA_ATTR_HASH(attrs[i]);
return (crc);
@@ -370,7 +370,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
if (bulk[i].sa_data) {
SA_COPY_DATA(bulk[i].sa_data_func,
bulk[i].sa_addr, bulk[i].sa_data,
- bulk[i].sa_size);
+ MIN(bulk[i].sa_size, bulk[i].sa_length));
}
continue;
@@ -402,7 +402,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
}
static sa_lot_t *
-sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+sa_add_layout_entry(objset_t *os, const sa_attr_type_t *attrs, int attr_count,
uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
{
sa_os_t *sa = os->os_sa;
@@ -415,7 +415,7 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
tb->lot_attr_count = attr_count;
tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
KM_SLEEP);
- bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+ memcpy(tb->lot_attrs, attrs, sizeof (sa_attr_type_t) * attr_count);
tb->lot_num = lot_num;
tb->lot_hash = hash;
tb->lot_instance = 0;
@@ -512,7 +512,7 @@ static void
sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
{
if (func == NULL) {
- bcopy(datastart, target, buflen);
+ memcpy(target, datastart, buflen);
} else {
boolean_t start;
int bytes;
@@ -524,7 +524,7 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
bytes = 0;
while (bytes < buflen) {
func(&dataptr, &length, buflen, start, datastart);
- bcopy(dataptr, saptr, length);
+ memcpy(saptr, dataptr, length);
saptr = (void *)((caddr_t)saptr + length);
bytes += length;
start = B_FALSE;
@@ -831,7 +831,7 @@ sa_free_attr_table(sa_os_t *sa)
}
static int
-sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+sa_attr_table_setup(objset_t *os, const sa_attr_reg_t *reg_attrs, int count)
{
sa_os_t *sa = os->os_sa;
uint64_t sa_attr_count = 0;
@@ -992,8 +992,8 @@ bail:
}
int
-sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
- sa_attr_type_t **user_table)
+sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs,
+ int count, sa_attr_type_t **user_table)
{
zap_cursor_t zc;
zap_attribute_t za;
@@ -1069,8 +1069,8 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
za.za_num_integers);
break;
}
- VERIFY(ddi_strtoull(za.za_name, NULL, 10,
- (unsigned long long *)&lot_num) == 0);
+ VERIFY0(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num));
(void) sa_add_layout_entry(os, lot_attrs,
za.za_num_integers, lot_num,
@@ -1202,6 +1202,7 @@ sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
uint8_t idx_len;
reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+ IMPLY(reg_length == 0, IS_SA_BONUSTYPE(type));
if (reg_length) {
attr_length = reg_length;
idx_len = 0;
@@ -1218,11 +1219,11 @@ sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
}
}
-/*ARGSUSED*/
static void
sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
uint16_t length, int length_idx, boolean_t variable_length, void *userp)
{
+ (void) hdr, (void) length_idx, (void) variable_length;
sa_handle_t *hdl = userp;
sa_os_t *sa = hdl->sa_os->os_sa;
@@ -1309,10 +1310,10 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
return (0);
}
-/*ARGSUSED*/
static void
sa_evict_sync(void *dbu)
{
+ (void) dbu;
panic("evicting sa dbuf\n");
}
@@ -1450,13 +1451,13 @@ sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
}
int
-sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+sa_buf_hold(objset_t *objset, uint64_t obj_num, const void *tag, dmu_buf_t **db)
{
return (dmu_bonus_hold(objset, obj_num, tag, db));
}
void
-sa_buf_rele(dmu_buf_t *db, void *tag)
+sa_buf_rele(dmu_buf_t *db, const void *tag)
{
dmu_buf_rele(db, tag);
}
@@ -1665,8 +1666,9 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
&xattr, 8);
if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
- bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
- scanstamp, AV_SCANSTAMP_SZ);
+ memcpy(scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ AV_SCANSTAMP_SZ);
SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL,
scanstamp, AV_SCANSTAMP_SZ);
zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
@@ -1874,7 +1876,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
if (dn->dn_bonuslen != 0) {
bonus_data_size = hdl->sa_bonus->db_size;
old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
- bcopy(hdl->sa_bonus->db_data, old_data[0],
+ memcpy(old_data[0], hdl->sa_bonus->db_data,
hdl->sa_bonus->db_size);
bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
} else {
@@ -1887,7 +1889,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
if ((error = sa_get_spill(hdl)) == 0) {
spill_data_size = hdl->sa_spill->db_size;
old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP);
- bcopy(hdl->sa_spill->db_data, old_data[1],
+ memcpy(old_data[1], hdl->sa_spill->db_data,
hdl->sa_spill->db_size);
spill_attr_count =
hdl->sa_spill_tab->sa_layout->lot_attr_count;
@@ -1917,7 +1919,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
count = bonus_attr_count;
hdr = SA_GET_HDR(hdl, SA_BONUS);
idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
- for (; k != 2; k++) {
+ for (; ; k++) {
/*
* Iterate over each attribute in layout. Fetch the
* size of variable-length attributes needing rewrite
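The bcopy()-to-memcpy() conversions above only swap the first two arguments; a trivial sketch of the equivalence:

/* bcopy(src, dst, len) == memcpy(dst, src, len) for non-overlapping
 * buffers; an overlapping copy would need memmove() instead. */
#include <string.h>

static void
copy_example(void *dst, const void *src, size_t len)
{
	/* old style: bcopy(src, dst, len); */
	memcpy(dst, src, len);
}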
diff --git a/sys/contrib/openzfs/module/zfs/sha256.c b/sys/contrib/openzfs/module/zfs/sha2_zfs.c
index d297768eada5..872b1e53ee66 100644
--- a/sys/contrib/openzfs/module/zfs/sha256.c
+++ b/sys/contrib/openzfs/module/zfs/sha2_zfs.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -18,16 +18,14 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
+
#include <sys/zfs_context.h>
-#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/sha2.h>
#include <sys/abd.h>
@@ -41,11 +39,11 @@ sha_incremental(void *buf, size_t size, void *arg)
return (0);
}
-/*ARGSUSED*/
void
-abd_checksum_SHA256(abd_t *abd, uint64_t size,
+abd_checksum_sha256(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ (void) ctx_template;
int ret;
SHA2_CTX ctx;
zio_cksum_t tmp;
@@ -78,11 +76,11 @@ bswap:
zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
}
-/*ARGSUSED*/
void
-abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
+abd_checksum_sha512_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ (void) ctx_template;
SHA2_CTX ctx;
SHA2Init(SHA512_256, &ctx);
@@ -90,14 +88,13 @@ abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
SHA2Final(zcp, &ctx);
}
-/*ARGSUSED*/
void
-abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
+abd_checksum_sha512_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
+ abd_checksum_sha512_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
diff --git a/sys/contrib/openzfs/module/zfs/skein_zfs.c b/sys/contrib/openzfs/module/zfs/skein_zfs.c
index 11b9940e027e..4b2aca888eee 100644
--- a/sys/contrib/openzfs/module/zfs/skein_zfs.c
+++ b/sys/contrib/openzfs/module/zfs/skein_zfs.c
@@ -41,18 +41,17 @@ skein_incremental(void *buf, size_t size, void *arg)
* function requires the presence of a ctx_template that should be allocated
* using abd_checksum_skein_tmpl_init.
*/
-/*ARGSUSED*/
void
abd_checksum_skein_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
- Skein_512_Ctxt_t ctx;
+ Skein_512_Ctxt_t ctx;
ASSERT(ctx_template != NULL);
- bcopy(ctx_template, &ctx, sizeof (ctx));
+ memcpy(&ctx, ctx_template, sizeof (ctx));
(void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
(void) Skein_512_Final(&ctx, (uint8_t *)zcp);
- bzero(&ctx, sizeof (ctx));
+ memset(&ctx, 0, sizeof (ctx));
}
/*
@@ -80,9 +79,8 @@ abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
void *
abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
{
- Skein_512_Ctxt_t *ctx;
+ Skein_512_Ctxt_t *ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
- ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
(void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0,
salt->zcs_bytes, sizeof (salt->zcs_bytes));
return (ctx);
@@ -95,8 +93,8 @@ abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
void
abd_checksum_skein_tmpl_free(void *ctx_template)
{
- Skein_512_Ctxt_t *ctx = ctx_template;
+ Skein_512_Ctxt_t *ctx = ctx_template;
- bzero(ctx, sizeof (*ctx));
+ memset(ctx, 0, sizeof (*ctx));
kmem_free(ctx, sizeof (*ctx));
}
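A hedged sketch of the salted-template lifecycle served by the functions above (kernel context and the abd/zio_checksum headers assumed):

/* The Skein checksum requires a context template carrying the pool salt;
 * it is created once, shared by the native/byteswap callers, freed last. */
static void
skein_cksum_example(abd_t *abd, uint64_t size, const zio_cksum_salt_t *salt)
{
	zio_cksum_t zc;
	void *tmpl = abd_checksum_skein_tmpl_init(salt);

	abd_checksum_skein_native(abd, size, tmpl, &zc);
	abd_checksum_skein_tmpl_free(tmpl);
}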
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 7546e3e414f1..638572996c3a 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -33,6 +33,8 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2024, Klara Inc.
*/
/*
@@ -52,6 +54,7 @@
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_removal.h>
@@ -61,6 +64,7 @@
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
+#include <sys/vdev_raidz.h>
#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -81,7 +85,6 @@
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
-#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
@@ -98,6 +101,27 @@
#include "zfs_prop.h"
#include "zfs_comutil.h"
+#include <cityhash.h>
+
+/*
+ * spa_thread() existed on Illumos as a parent thread for the various worker
+ * threads that actually run the pool, as a way to both reference the entire
+ * pool work as a single object, and to share properties like scheduling
+ * options. It has not yet been adapted to Linux or FreeBSD. This define is
+ * used to mark related parts of the code to make things easier for the reader,
+ * and to compile this code out. It can be removed when someone implements it,
+ * moves it to some Illumos-specific place, or removes it entirely.
+ */
+#undef HAVE_SPA_THREAD
+
+/*
+ * The "System Duty Cycle" scheduling class is an Illumos feature to help
+ * prevent CPU-intensive kernel threads from affecting latency on interactive
+ * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
+ * gated behind a define. On Illumos SDC depends on spa_thread(), but
+ * spa_thread() also has other uses, so this is a separate define.
+ */
+#undef HAVE_SYSDC
/*
* The interval, in seconds, at which failed configuration cache file writes
@@ -107,16 +131,16 @@ int zfs_ccw_retry_interval = 300;
typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
- ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */
+ ZTI_MODE_SYNC, /* sync thread assigned */
ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES
} zti_modes_t;
#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
-#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 }
+#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 }
#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
#define ZTI_N(n) ZTI_P(n, 1)
@@ -137,42 +161,60 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* initializing a pool, we use this table to create an appropriately sized
* taskq. Some operations are low volume and therefore have a small, static
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
- * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macros. Other operations process a large amount of data; the ZTI_SCALE
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
- * but with number of taskqs also scaling with number of CPUs.
+ * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
+ * that scales with the number of CPUs.
*
* The different taskq priorities are to handle the different contexts (issue
- * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
- * need to be handled with minimum delay.
+ * and interrupt) and then to reserve threads for high priority I/Os that
+ * need to be handled with minimum delay. The Illumos taskq has an unfair
+ * TQ_FRONT implementation, so separate high-priority threads are used there.
*/
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
- { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
+#ifdef illumos
+ { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */
+#else
+ { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */
+#endif
{ ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */
{ ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
};
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
-static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type,
+ const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
-uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
-uint_t zio_taskq_batch_tpq; /* threads per taskq */
-boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
-uint_t zio_taskq_basedc = 80; /* base duty cycle */
+/*
+ * Percentage of all CPUs that can be used by the metaslab preload taskq.
+ */
+static uint_t metaslab_preload_pct = 50;
+
+static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
+static uint_t zio_taskq_batch_tpq; /* threads per taskq */
+
+#ifdef HAVE_SYSDC
+static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
+static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
+#endif
+
+#ifdef HAVE_SPA_THREAD
+static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
+#endif
-boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
+static uint_t zio_taskq_write_tpq = 16;
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
@@ -195,7 +237,7 @@ boolean_t spa_mode_readable_spacemaps = B_FALSE;
/*
* For debugging purposes: print out vdev tree during pool import.
*/
-int spa_load_print_vdev_tree = B_FALSE;
+static int spa_load_print_vdev_tree = B_FALSE;
/*
* A non-zero value for zfs_max_missing_tvds means that we allow importing
@@ -218,7 +260,7 @@ int spa_load_print_vdev_tree = B_FALSE;
* there are also risks of performing an inadvertent rewind as we might be
* missing all the vdevs with the latest uberblocks.
*/
-unsigned long zfs_max_missing_tvds = 0;
+uint64_t zfs_max_missing_tvds = 0;
/*
* The parameters below are similar to zfs_max_missing_tvds but are only
@@ -244,28 +286,28 @@ uint64_t zfs_max_missing_tvds_scan = 0;
/*
* Debugging aid that pauses spa_sync() towards the end.
*/
-boolean_t zfs_pause_spa_sync = B_FALSE;
+static const boolean_t zfs_pause_spa_sync = B_FALSE;
/*
* Variables to indicate the livelist condense zthr func should wait at certain
* points for the livelist to be removed - used to test condense/destroy races
*/
-int zfs_livelist_condense_zthr_pause = 0;
-int zfs_livelist_condense_sync_pause = 0;
+static int zfs_livelist_condense_zthr_pause = 0;
+static int zfs_livelist_condense_sync_pause = 0;
/*
* Variables to track whether or not condense cancellation has been
* triggered in testing.
*/
-int zfs_livelist_condense_sync_cancel = 0;
-int zfs_livelist_condense_zthr_cancel = 0;
+static int zfs_livelist_condense_sync_cancel = 0;
+static int zfs_livelist_condense_zthr_cancel = 0;
/*
* Variable to track whether or not extra ALLOC blkptrs were added to a
* livelist entry while it was being condensed (caused by the way we track
* remapped blkptrs in dbuf_remap_impl)
*/
-int zfs_livelist_condense_new_alloc = 0;
+static int zfs_livelist_condense_new_alloc = 0;
/*
* ==========================================================================
@@ -277,7 +319,7 @@ int zfs_livelist_condense_new_alloc = 0;
* Add a (source=src, propname=propval) list to an nvlist.
*/
static void
-spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
+spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
uint64_t intval, zprop_source_t src)
{
const char *propname = zpool_prop_to_name(prop);
@@ -296,6 +338,22 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
}
/*
+ * Add a user property (source=src, propname=propval) to an nvlist.
+ */
+static void
+spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
+ zprop_source_t src)
+{
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
+ VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+/*
* Get property values from the spa configuration.
*/
static void
@@ -341,6 +399,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
ddt_get_pool_dedup_ratio(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
+ brt_get_used(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
+ brt_get_saved(spa), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
+ brt_get_ratio(spa), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
rvd->vdev_state, src);
@@ -464,7 +528,8 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
zprop_source_t src = ZPROP_SRC_DEFAULT;
zpool_prop_t prop;
- if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
+ if ((prop = zpool_name_to_prop(za.za_name)) ==
+ ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name))
continue;
switch (za.za_integer_length) {
@@ -507,7 +572,13 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
kmem_free(strval, za.za_num_integers);
break;
}
- spa_prop_add_list(*nvp, prop, strval, 0, src);
+ if (prop != ZPOOL_PROP_INVAL) {
+ spa_prop_add_list(*nvp, prop, strval, 0, src);
+ } else {
+ src = ZPROP_SRC_LOCAL;
+ spa_prop_add_user(*nvp, za.za_name, strval,
+ src);
+ }
kmem_free(strval, za.za_num_integers);
break;
@@ -543,42 +614,53 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
elem = NULL;
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
uint64_t intval;
- char *strval, *slash, *check, *fname;
+ const char *strval, *slash, *check, *fname;
const char *propname = nvpair_name(elem);
zpool_prop_t prop = zpool_name_to_prop(propname);
switch (prop) {
case ZPOOL_PROP_INVAL:
- if (!zpool_prop_feature(propname)) {
- error = SET_ERROR(EINVAL);
- break;
- }
-
/*
* Sanitize the input.
*/
- if (nvpair_type(elem) != DATA_TYPE_UINT64) {
- error = SET_ERROR(EINVAL);
- break;
- }
+ if (zfs_prop_user(propname)) {
+ if (strlen(propname) >= ZAP_MAXNAMELEN) {
+ error = SET_ERROR(ENAMETOOLONG);
+ break;
+ }
- if (nvpair_value_uint64(elem, &intval) != 0) {
- error = SET_ERROR(EINVAL);
- break;
- }
+ if (strlen(fnvpair_value_string(elem)) >=
+ ZAP_MAXVALUELEN) {
+ error = SET_ERROR(E2BIG);
+ break;
+ }
+ } else if (zpool_prop_feature(propname)) {
+ if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
- if (intval != 0) {
- error = SET_ERROR(EINVAL);
- break;
- }
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (intval != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ fname = strchr(propname, '@') + 1;
+ if (zfeature_lookup_name(fname, NULL) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
- fname = strchr(propname, '@') + 1;
- if (zfeature_lookup_name(fname, NULL) != 0) {
+ has_feature = B_TRUE;
+ } else {
error = SET_ERROR(EINVAL);
break;
}
-
- has_feature = B_TRUE;
break;
case ZPOOL_PROP_VERSION:
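A hedged sketch of the pool user properties now accepted by spa_prop_validate() above: names carry a namespace (the "org.example:note" name below is illustrative) and are only bounded by ZAP_MAXNAMELEN for the name and ZAP_MAXVALUELEN for the value (assumes the fnvlist_* helpers):

static nvlist_t *
user_prop_example(void)
{
	nvlist_t *props = fnvlist_alloc();

	/* user properties are plain string pairs with a namespaced name */
	fnvlist_add_string(props, "org.example:note", "scratch pool");
	return (props);	/* handed to spa_prop_set() by the ioctl path */
}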
@@ -745,7 +827,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
- char *cachefile;
+ const char *cachefile;
spa_config_dirent_t *dp;
if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
@@ -785,8 +867,14 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp)
prop == ZPOOL_PROP_READONLY)
continue;
+ if (prop == ZPOOL_PROP_INVAL &&
+ zfs_prop_user(nvpair_name(elem))) {
+ need_sync = B_TRUE;
+ break;
+ }
+
if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
- uint64_t ver;
+ uint64_t ver = 0;
if (prop == ZPOOL_PROP_VERSION) {
VERIFY(nvpair_value_uint64(elem, &ver) == 0);
@@ -840,7 +928,6 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
}
}
-/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
@@ -910,7 +997,16 @@ spa_change_guid(spa_t *spa)
spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
if (error == 0) {
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ /*
+ * Clear the kobj flag from all the vdevs to allow
+ * vdev_cache_process_kobj_evt() to post events to all the
+ * vdevs since GUID is updated.
+ */
+ vdev_clear_kobj_evt(spa->spa_root_vdev);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+ vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
}
@@ -948,8 +1044,8 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
- bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
- bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+ memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
+ memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));
avl_create(&spa->spa_errlist_scrub,
spa_error_entry_compare, sizeof (spa_error_entry_t),
@@ -968,17 +1064,33 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t cpus, flags = TASKQ_DYNAMIC;
- boolean_t batch = B_FALSE;
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >, 0);
break;
- case ZTI_MODE_BATCH:
- batch = B_TRUE;
+ case ZTI_MODE_SYNC:
+
+ /*
+ * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+ * not to exceed the number of spa allocators, and align to it.
+ */
+ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+ count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
+ count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+ count = MIN(count, spa->spa_alloc_count);
+ while (spa->spa_alloc_count % count != 0 &&
+ spa->spa_alloc_count < count * 2)
+ count--;
+
+ /*
+ * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
+ * single taskq may have more threads than 100% of online cpus.
+ */
+ value = (zio_taskq_batch_pct + count / 2) / count;
+ value = MIN(value, 100);
flags |= TASKQ_THREADS_CPU_PCT;
- value = MIN(zio_taskq_batch_pct, 100);
break;
case ZTI_MODE_SCALE:
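A worked, user-space replica of the ZTI_MODE_SYNC sizing above, with illustrative inputs (128 online CPUs, zio_taskq_batch_pct=80, zio_taskq_write_tpq=16, 4 spa allocators):

#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned boot_ncpus = 128, batch_pct = 80, write_tpq = 16, alloc = 4;

	unsigned cpus = MAX(1, boot_ncpus * batch_pct / 100);	/* 102 */
	unsigned count = MAX(1, cpus / MAX(1, write_tpq));	/* 6 */
	count = MAX(count, (batch_pct + 99) / 100);		/* 6 */
	count = MIN(count, alloc);				/* 4 */
	while (alloc % count != 0 && alloc < count * 2)
		count--;					/* stays 4 */
	unsigned value = MIN((batch_pct + count / 2) / count, 100); /* 20 */

	printf("%u sync taskqs, each %u%% of CPUs\n", count, value);
	return (0);
}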
@@ -1025,7 +1137,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
- "spa_activate()",
+ "spa_taskqs_init()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}
@@ -1045,13 +1157,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
(void) snprintf(name, sizeof (name), "%s_%s",
zio_type_name[t], zio_taskq_types[q]);
+#ifdef HAVE_SYSDC
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
- if (batch)
- flags |= TASKQ_DC_BATCH;
-
+ (void) zio_taskq_basedc;
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
+#endif
pri_t pri = maxclsyspri;
/*
* The write issue taskq can be extremely CPU
@@ -1077,7 +1189,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
}
tq = taskq_create_proc(name, value, pri, 50,
INT_MAX, spa->spa_proc, flags);
+#ifdef HAVE_SYSDC
}
+#endif
tqs->stqs_taskq[i] = tq;
}
@@ -1102,54 +1216,309 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tqs->stqs_taskq = NULL;
}
+#ifdef _KERNEL
/*
- * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
- * Note that a type may have multiple discrete taskqs to avoid lock contention
- * on the taskq itself. In that case we choose which taskq at random by using
- * the low bits of gethrtime().
+ * The READ and WRITE rows of zio_taskqs are configurable at module load time
+ * by setting zio_taskq_read or zio_taskq_write.
+ *
+ * Example (the defaults for READ and WRITE)
+ * zio_taskq_read='fixed,1,8 null scale null'
+ * zio_taskq_write='sync null scale null'
+ *
+ * Each sets the entire row at a time.
+ *
+ * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
+ * of threads per taskq.
+ *
+ * 'null' can only be set on the high-priority queues (queue selection for
+ * high-priority queues will fall back to the regular queue if the high-pri
+ * is NULL).
*/
-void
-spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+static const char *const modes[ZTI_NMODES] = {
+ "fixed", "scale", "sync", "null"
+};
+
+/* Parse the incoming config string. Modifies cfg */
+static int
+spa_taskq_param_set(zio_type_t t, char *cfg)
{
- spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
- taskq_t *tq;
+ int err = 0;
- ASSERT3P(tqs->stqs_taskq, !=, NULL);
- ASSERT3U(tqs->stqs_count, !=, 0);
+ zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
- if (tqs->stqs_count == 1) {
- tq = tqs->stqs_taskq[0];
- } else {
- tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
+ char *next = cfg, *tok, *c;
+
+ /*
+ * Parse out each element from the string and fill `row`. The entire
+ * row has to be set at once, so any errors are flagged by just
+ * breaking out of this loop early.
+ */
+ uint_t q;
+ for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ /* `next` is the start of the config */
+ if (next == NULL)
+ break;
+
+ /* Eat up leading space */
+ while (isspace(*next))
+ next++;
+ if (*next == '\0')
+ break;
+
+ /* Mode ends at space or end of string */
+ tok = next;
+ next = strchr(tok, ' ');
+ if (next != NULL) *next++ = '\0';
+
+ /* Parameters start after a comma */
+ c = strchr(tok, ',');
+ if (c != NULL) *c++ = '\0';
+
+ /* Match mode string */
+ uint_t mode;
+ for (mode = 0; mode < ZTI_NMODES; mode++)
+ if (strcmp(tok, modes[mode]) == 0)
+ break;
+ if (mode == ZTI_NMODES)
+ break;
+
+ /* Invalid canary */
+ row[q].zti_mode = ZTI_NMODES;
+
+ /* Per-mode setup */
+ switch (mode) {
+
+ /*
+ * FIXED is parameterised: number of queues, and number of
+ * threads per queue.
+ */
+ case ZTI_MODE_FIXED: {
+ /* No parameters? */
+ if (c == NULL || *c == '\0')
+ break;
+
+ /* Find next parameter */
+ tok = c;
+ c = strchr(tok, ',');
+ if (c == NULL)
+ break;
+
+ /* Take digits and convert */
+ unsigned long long nq;
+ if (!(isdigit(*tok)))
+ break;
+ err = ddi_strtoull(tok, &tok, 10, &nq);
+ /* Must succeed and also end at the next param sep */
+ if (err != 0 || tok != c)
+ break;
+
+ /* Move past the comma */
+ tok++;
+ /* Need another number */
+ if (!(isdigit(*tok)))
+ break;
+ /* Remember start to make sure we moved */
+ c = tok;
+
+ /* Take digits */
+ unsigned long long ntpq;
+ err = ddi_strtoull(tok, &tok, 10, &ntpq);
+ /* Must succeed, and moved forward */
+ if (err != 0 || tok == c || *tok != '\0')
+ break;
+
+ /*
+ * sanity; zero queues/threads make no sense, and
+ * 16K is almost certainly more than anyone will ever
+ * need and avoids silly numbers like UINT32_MAX
+ */
+ if (nq == 0 || nq >= 16384 ||
+ ntpq == 0 || ntpq >= 16384)
+ break;
+
+ const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
+ row[q] = zti;
+ break;
+ }
+
+ case ZTI_MODE_SCALE: {
+ const zio_taskq_info_t zti = ZTI_SCALE;
+ row[q] = zti;
+ break;
+ }
+
+ case ZTI_MODE_SYNC: {
+ const zio_taskq_info_t zti = ZTI_SYNC;
+ row[q] = zti;
+ break;
+ }
+
+ case ZTI_MODE_NULL: {
+ /*
+ * Can only null the high-priority queues; the general-
+ * purpose ones have to exist.
+ */
+ if (q != ZIO_TASKQ_ISSUE_HIGH &&
+ q != ZIO_TASKQ_INTERRUPT_HIGH)
+ break;
+
+ const zio_taskq_info_t zti = ZTI_NULL;
+ row[q] = zti;
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ /* Ensure we set a mode */
+ if (row[q].zti_mode == ZTI_NMODES)
+ break;
}
- taskq_dispatch_ent(tq, func, arg, flags, ent);
+ /* Didn't get a full row, fail */
+ if (q < ZIO_TASKQ_TYPES)
+ return (SET_ERROR(EINVAL));
+
+ /* Eat trailing space */
+ if (next != NULL)
+ while (isspace(*next))
+ next++;
+
+ /* If there's anything left over then fail */
+ if (next != NULL && *next != '\0')
+ return (SET_ERROR(EINVAL));
+
+ /* Success! Copy it into the real config */
+ for (q = 0; q < ZIO_TASKQ_TYPES; q++)
+ zio_taskqs[t][q] = row[q];
+
+ return (0);
+}
+
+static int
+spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
+{
+ int pos = 0;
+
+	/* Build parameter string from live config */
+ const char *sep = "";
+ for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ const zio_taskq_info_t *zti = &zio_taskqs[t][q];
+ if (zti->zti_mode == ZTI_MODE_FIXED)
+ pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
+ modes[zti->zti_mode], zti->zti_count,
+ zti->zti_value);
+ else
+ pos += sprintf(&buf[pos], "%s%s", sep,
+ modes[zti->zti_mode]);
+ sep = " ";
+ }
+
+ if (add_newline)
+ buf[pos++] = '\n';
+ buf[pos] = '\0';
+
+ return (pos);
+}
+
+#ifdef __linux__
+static int
+spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
+{
+ char *cfg = kmem_strdup(val);
+ int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
+ kmem_free(cfg, strlen(val)+1);
+ return (-err);
+}
+static int
+spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
+{
+ return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
}
+static int
+spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
+{
+ char *cfg = kmem_strdup(val);
+ int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
+ kmem_free(cfg, strlen(val)+1);
+ return (-err);
+}
+static int
+spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
+{
+ return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
+}
+#else
/*
- * Same as spa_taskq_dispatch_ent() but block on the task until completion.
+ * On FreeBSD load-time parameters can be set up before malloc() is available,
+ * so we have to do all the parsing work on the stack.
+ */
+#define SPA_TASKQ_PARAM_MAX (128)
+
+static int
+spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
+{
+ char buf[SPA_TASKQ_PARAM_MAX];
+ int err;
+
+ (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err || req->newptr == NULL)
+ return (err);
+ return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
+}
+
+static int
+spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
+{
+ char buf[SPA_TASKQ_PARAM_MAX];
+ int err;
+
+ (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err || req->newptr == NULL)
+ return (err);
+ return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
+}
+#endif
+#endif /* _KERNEL */
+
+/*
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
+ * on the taskq itself.
*/
void
-spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
- task_func_t *func, void *arg, uint_t flags)
+spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, zio_t *zio, boolean_t cutinline)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;
- taskqid_t id;
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
+ /*
+ * NB: We are assuming that the zio can only be dispatched
+ * to a single taskq at a time. It would be a grievous error
+ * to dispatch the zio to another taskq at the same time.
+ */
+ ASSERT(zio);
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
+ } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+ ZIO_HAS_ALLOCATOR(zio)) {
+ tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
- id = taskq_dispatch(tq, func, arg, flags);
- if (id)
- taskq_wait_id(tq, id);
+ taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0,
+ &zio->io_tqent);
}
static void
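A hedged sketch exercising the row parser added above (these functions are static to spa.c, so this is purely illustrative of the 'fixed,Q,T ...' format backing the zio_taskq_read/zio_taskq_write parameters):

/* "fixed,2,16" means two taskqs of 16 threads each; 'null' is only legal
 * for the high-priority columns. spa_taskq_param_set() modifies its
 * argument, so a writable copy is required. */
static void
taskq_param_example(void)
{
	char cfg[] = "fixed,2,16 null scale null";
	char buf[128];

	if (spa_taskq_param_set(ZIO_TYPE_WRITE, cfg) == 0)
		(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, B_FALSE);
	/* buf should now read back "fixed,2,16 null scale null" */
}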
@@ -1162,11 +1531,6 @@ spa_create_zio_taskqs(spa_t *spa)
}
}
-/*
- * Disabled until spa_thread() can be adapted for Linux.
- */
-#undef HAVE_SPA_THREAD
-
#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
static void
spa_thread(void *arg)
@@ -1207,9 +1571,11 @@ spa_thread(void *arg)
pool_unlock();
}
+#ifdef HAVE_SYSDC
if (zio_taskq_sysdc) {
sysdc_thread_enter(curthread, 100, 0);
}
+#endif
spa->spa_proc = curproc;
spa->spa_did = curthread->t_did;
@@ -1238,24 +1604,26 @@ spa_thread(void *arg)
}
#endif
+extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
+
/*
* Activate an uninitialized pool.
*/
static void
spa_activate(spa_t *spa, spa_mode_t mode)
{
+ metaslab_ops_t *msp = metaslab_allocator(spa);
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_mode = mode;
spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
- spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
- spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
- spa->spa_embedded_log_class =
- metaslab_class_create(spa, zfs_metaslab_ops);
- spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
- spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_normal_class = metaslab_class_create(spa, msp);
+ spa->spa_log_class = metaslab_class_create(spa, msp);
+ spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
+ spa->spa_special_class = metaslab_class_create(spa, msp);
+ spa->spa_dedup_class = metaslab_class_create(spa, msp);
/* Try to create a covering process */
mutex_enter(&spa->spa_proc_lock);
@@ -1313,6 +1681,11 @@ spa_activate(spa_t *spa, spa_mode_t mode)
avl_create(&spa->spa_errlist_last,
spa_error_entry_compare, sizeof (spa_error_entry_t),
offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_healed,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+
+ spa_activate_os(spa);
spa_keystore_init(&spa->spa_keystore);
@@ -1335,6 +1708,13 @@ spa_activate(spa_t *spa, spa_mode_t mode)
1, INT_MAX, 0);
/*
+ * The taskq to preload metaslabs.
+ */
+ spa->spa_metaslab_taskq = taskq_create("z_metaslab",
+ metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
+ TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
+ /*
* Taskq dedicated to prefetcher threads: this is used to prevent the
* pool traverse code from monopolizing the global (and limited)
* system_taskq by inappropriately scheduling long running tasks on it.
@@ -1369,6 +1749,11 @@ spa_deactivate(spa_t *spa)
spa->spa_zvol_taskq = NULL;
}
+ if (spa->spa_metaslab_taskq) {
+ taskq_destroy(spa->spa_metaslab_taskq);
+ spa->spa_metaslab_taskq = NULL;
+ }
+
if (spa->spa_prefetch_taskq) {
taskq_destroy(spa->spa_prefetch_taskq);
spa->spa_prefetch_taskq = NULL;
@@ -1421,6 +1806,7 @@ spa_deactivate(spa_t *spa)
spa_errlog_drain(spa);
avl_destroy(&spa->spa_errlist_scrub);
avl_destroy(&spa->spa_errlist_last);
+ avl_destroy(&spa->spa_errlist_healed);
spa_keystore_fini(&spa->spa_keystore);
@@ -1450,6 +1836,9 @@ spa_deactivate(spa_t *spa)
thread_join(spa->spa_did);
spa->spa_did = 0;
}
+
+ spa_deactivate_os(spa);
+
}
/*
@@ -1542,16 +1931,16 @@ spa_unload_log_sm_metadata(spa_t *spa)
{
void *cookie = NULL;
spa_log_sm_t *sls;
+ log_summary_entry_t *e;
+
while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
&cookie)) != NULL) {
VERIFY0(sls->sls_mscount);
kmem_free(sls, sizeof (spa_log_sm_t));
}
- for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
- e != NULL; e = list_head(&spa->spa_log_summary)) {
+ while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
VERIFY0(e->lse_mscount);
- list_remove(&spa->spa_log_summary, e);
kmem_free(e, sizeof (log_summary_entry_t));
}
@@ -1579,6 +1968,10 @@ spa_destroy_aux_threads(spa_t *spa)
zthr_destroy(spa->spa_livelist_condense_zthr);
spa->spa_livelist_condense_zthr = NULL;
}
+ if (spa->spa_raidz_expand_zthr != NULL) {
+ zthr_destroy(spa->spa_raidz_expand_zthr);
+ spa->spa_raidz_expand_zthr = NULL;
+ }
}
/*
@@ -1587,7 +1980,8 @@ spa_destroy_aux_threads(spa_t *spa)
static void
spa_unload(spa_t *spa)
{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
spa_import_progress_remove(spa_guid(spa));
@@ -1596,25 +1990,33 @@ spa_unload(spa_t *spa)
spa_wake_waiters(spa);
/*
- * If the log space map feature is enabled and the pool is getting
- * exported (but not destroyed), we want to spend some time flushing
- * as many metaslabs as we can in an attempt to destroy log space
- * maps and save import time.
+ * If we have set the spa_final_txg, we have already performed the
+ * tasks below in spa_export_common(). We should not redo them here
+ * since we delay the final TXGs beyond what spa_final_txg is set at.
*/
- if (spa_should_flush_logs_on_unload(spa))
- spa_unload_log_sm_flush_all(spa);
+ if (spa->spa_final_txg == UINT64_MAX) {
+ /*
+ * If the log space map feature is enabled and the pool is
+ * getting exported (but not destroyed), we want to spend some
+ * time flushing as many metaslabs as we can in an attempt to
+ * destroy log space maps and save import time.
+ */
+ if (spa_should_flush_logs_on_unload(spa))
+ spa_unload_log_sm_flush_all(spa);
- /*
- * Stop async tasks.
- */
- spa_async_suspend(spa);
+ /*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
- if (spa->spa_root_vdev) {
- vdev_t *root_vdev = spa->spa_root_vdev;
- vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
- vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
- vdev_autotrim_stop_all(spa);
- vdev_rebuild_stop_all(spa);
+ if (spa->spa_root_vdev) {
+ vdev_t *root_vdev = spa->spa_root_vdev;
+ vdev_initialize_stop_all(root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
+ }
}
/*
@@ -1629,13 +2031,7 @@ spa_unload(spa_t *spa)
* This ensures that there is no async metaslab prefetching
* while we attempt to unload the spa.
*/
- if (spa->spa_root_vdev != NULL) {
- for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
- vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
- if (vc->vdev_mg != NULL)
- taskq_wait(vc->vdev_mg->mg_taskq);
- }
- }
+ taskq_wait(spa->spa_metaslab_taskq);
if (spa->spa_mmp.mmp_thread)
mmp_thread_stop(spa);
@@ -1680,6 +2076,7 @@ spa_unload(spa_t *spa)
}
ddt_unload(spa);
+ brt_unload(spa);
spa_unload_log_sm_metadata(spa);
/*
@@ -1687,9 +2084,9 @@ spa_unload(spa_t *spa)
*/
spa_l2cache_drop(spa);
- for (int i = 0; i < spa->spa_spares.sav_count; i++)
- vdev_free(spa->spa_spares.sav_vdevs[i]);
if (spa->spa_spares.sav_vdevs) {
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ vdev_free(spa->spa_spares.sav_vdevs[i]);
kmem_free(spa->spa_spares.sav_vdevs,
spa->spa_spares.sav_count * sizeof (void *));
spa->spa_spares.sav_vdevs = NULL;
@@ -1700,11 +2097,11 @@ spa_unload(spa_t *spa)
}
spa->spa_spares.sav_count = 0;
- for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
- vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
- vdev_free(spa->spa_l2cache.sav_vdevs[i]);
- }
if (spa->spa_l2cache.sav_vdevs) {
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
+ vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+ }
kmem_free(spa->spa_l2cache.sav_vdevs,
spa->spa_l2cache.sav_count * sizeof (void *));
spa->spa_l2cache.sav_vdevs = NULL;
@@ -1728,6 +2125,8 @@ spa_unload(spa_t *spa)
spa->spa_compatibility = NULL;
}
+ spa->spa_raidz_expand = NULL;
+
spa_config_exit(spa, SCL_ALL, spa);
}
@@ -1762,20 +2161,21 @@ spa_load_spares(spa_t *spa)
/*
* First, close and free any existing spare vdevs.
*/
- for (i = 0; i < spa->spa_spares.sav_count; i++) {
- vd = spa->spa_spares.sav_vdevs[i];
+ if (spa->spa_spares.sav_vdevs) {
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
- /* Undo the call to spa_activate() below */
- if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
- B_FALSE)) != NULL && tvd->vdev_isspare)
- spa_spare_remove(tvd);
- vdev_close(vd);
- vdev_free(vd);
- }
+ /* Undo the call to spa_activate() below */
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+ B_FALSE)) != NULL && tvd->vdev_isspare)
+ spa_spare_remove(tvd);
+ vdev_close(vd);
+ vdev_free(vd);
+ }
- if (spa->spa_spares.sav_vdevs)
kmem_free(spa->spa_spares.sav_vdevs,
spa->spa_spares.sav_count * sizeof (void *));
+ }
if (spa->spa_spares.sav_config == NULL)
nspares = 0;
@@ -1851,7 +2251,8 @@ spa_load_spares(spa_t *spa)
spares[i] = vdev_config_generate(spa,
spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count);
+ ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
+ spa->spa_spares.sav_count);
for (i = 0; i < spa->spa_spares.sav_count; i++)
nvlist_free(spares[i]);
kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
@@ -1978,30 +2379,31 @@ spa_load_l2cache(spa_t *spa)
for (i = 0; i < sav->sav_count; i++)
l2cache[i] = vdev_config_generate(spa,
sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
- fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache,
- sav->sav_count);
+ fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ (const nvlist_t * const *)l2cache, sav->sav_count);
out:
/*
* Purge vdevs that were dropped
*/
- for (i = 0; i < oldnvdevs; i++) {
- uint64_t pool;
+ if (oldvdevs) {
+ for (i = 0; i < oldnvdevs; i++) {
+ uint64_t pool;
- vd = oldvdevs[i];
- if (vd != NULL) {
- ASSERT(vd->vdev_isl2cache);
+ vd = oldvdevs[i];
+ if (vd != NULL) {
+ ASSERT(vd->vdev_isl2cache);
- if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
- pool != 0ULL && l2arc_vdev_present(vd))
- l2arc_remove_vdev(vd);
- vdev_clear_stats(vd);
- vdev_free(vd);
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ vdev_clear_stats(vd);
+ vdev_free(vd);
+ }
}
- }
- if (oldvdevs)
kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
+ }
for (i = 0; i < sav->sav_count; i++)
nvlist_free(l2cache[i]);
@@ -2107,8 +2509,8 @@ spa_check_for_missing_logs(spa_t *spa)
}
if (idx > 0) {
- fnvlist_add_nvlist_array(nv,
- ZPOOL_CONFIG_CHILDREN, child, idx);
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ (const nvlist_t * const *)child, idx);
fnvlist_add_nvlist(spa->spa_load_info,
ZPOOL_CONFIG_MISSING_DEVICES, nv);
@@ -2243,12 +2645,13 @@ spa_claim_notify(zio_t *zio)
return;
mutex_enter(&spa->spa_props_lock); /* any mutex will do */
- if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
- spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
+ spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
mutex_exit(&spa->spa_props_lock);
}
typedef struct spa_load_error {
+ boolean_t sle_verify_data;
uint64_t sle_meta_count;
uint64_t sle_data_count;
} spa_load_error_t;
@@ -2281,18 +2684,19 @@ spa_load_verify_done(zio_t *zio)
* Maximum number of inflight bytes is the log2 fraction of the arc size.
* By default, we set it to 1/16th of the arc.
*/
-int spa_load_verify_shift = 4;
-int spa_load_verify_metadata = B_TRUE;
-int spa_load_verify_data = B_TRUE;
+static uint_t spa_load_verify_shift = 4;
+static int spa_load_verify_metadata = B_TRUE;
+static int spa_load_verify_data = B_TRUE;
-/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
- if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
- BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
- return (0);
+ zio_t *rio = arg;
+ spa_load_error_t *sle = rio->io_private;
+
+ (void) zilog, (void) dnp;
+
/*
* Note: normally this routine will not be called if
* spa_load_verify_metadata is not set. However, it may be useful
@@ -2300,12 +2704,28 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
if (!spa_load_verify_metadata)
return (0);
- if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
+
+ /*
+ * Sanity check the block pointer in order to detect obvious damage
+ * before using the contents in subsequent checks or in zio_read().
+ * When damaged consider it to be a metadata error since we cannot
+ * trust the BP_GET_TYPE and BP_GET_LEVEL values.
+ */
+ if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
+ atomic_inc_64(&sle->sle_meta_count);
+ return (0);
+ }
+
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
+ return (0);
+
+ if (!BP_IS_METADATA(bp) &&
+ (!spa_load_verify_data || !sle->sle_verify_data))
return (0);
uint64_t maxinflight_bytes =
arc_target_bytes() >> spa_load_verify_shift;
- zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
mutex_enter(&spa->spa_scrub_lock);
@@ -2321,10 +2741,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (0);
}
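
The added zfs_blkptr_verify() call validates the block pointer before any of its fields are trusted, and tallies a failure with atomic_inc_64() instead of aborting the traversal. A minimal userland analogue of that check-then-count shape, with invented record fields and counter names:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Invented record standing in for a block pointer. */
struct record {
	uint64_t magic;		/* expected constant */
	uint64_t psize;		/* payload size; must be non-zero */
};

static atomic_uint_fast64_t meta_errors;

/*
 * Sanity-check the record before trusting any of its fields; a bad record
 * is tallied and skipped rather than aborting the walk.
 */
static int
verify_cb(const struct record *r)
{
	if (r->magic != 0xbadcafeULL || r->psize == 0) {
		atomic_fetch_add(&meta_errors, 1);
		return (0);		/* keep walking; error was counted */
	}
	/* Safe to size reads off r->psize from here on. */
	return (0);
}

int
main(void)
{
	struct record good = { 0xbadcafeULL, 4096 };
	struct record bad = { 0, 0 };

	verify_cb(&good);
	verify_cb(&bad);
	printf("metadata errors: %llu\n",
	    (unsigned long long)atomic_load(&meta_errors));
	return (0);
}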
-/* ARGSUSED */
static int
verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
+ (void) dp, (void) arg;
+
if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
@@ -2342,7 +2763,8 @@ spa_load_verify(spa_t *spa)
zpool_get_load_policy(spa->spa_config, &policy);
- if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
+ if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
+ policy.zlp_maxmeta == UINT64_MAX)
return (0);
dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
@@ -2353,6 +2775,13 @@ spa_load_verify(spa_t *spa)
if (error != 0)
return (error);
+ /*
+ * Verify data only if we are rewinding or an error limit was set.
+ * Otherwise nothing except dbgmsg cares about it, so don't waste the time.
+ */
+ sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
+ (policy.zlp_maxdata < UINT64_MAX);
+
rio = zio_root(spa, NULL, &sle,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
@@ -2397,6 +2826,8 @@ spa_load_verify(spa_t *spa)
fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
loss);
fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
+ fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
} else {
spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
@@ -2454,10 +2885,10 @@ spa_livelist_delete_check(spa_t *spa)
return (spa->spa_livelists_to_delete != 0);
}
-/* ARGSUSED */
static boolean_t
spa_livelist_delete_cb_check(void *arg, zthr_t *z)
{
+ (void) z;
spa_t *spa = arg;
return (spa_livelist_delete_check(spa));
}
@@ -2549,7 +2980,6 @@ livelist_delete_sync(void *arg, dmu_tx_t *tx)
* be freed. Then, call a synctask which performs the actual frees and updates
* the pool-wide livelist data.
*/
-/* ARGSUSED */
static void
spa_livelist_delete_cb(void *arg, zthr_t *z)
{
@@ -2795,7 +3225,6 @@ spa_livelist_condense_cb(void *arg, zthr_t *t)
zfs_livelist_condense_zthr_cancel++;
}
-/* ARGSUSED */
/*
* Check that there is something to condense but that a condense is not
* already in progress and that condensing has not been cancelled.
@@ -2803,6 +3232,7 @@ spa_livelist_condense_cb(void *arg, zthr_t *t)
static boolean_t
spa_livelist_condense_cb_check(void *arg, zthr_t *z)
{
+ (void) z;
spa_t *spa = arg;
if ((spa->spa_to_condense.ds != NULL) &&
(spa->spa_to_condense.syncing == B_FALSE) &&
@@ -2833,8 +3263,7 @@ spa_spawn_aux_threads(spa_t *spa)
{
ASSERT(spa_writeable(spa));
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
+ spa_start_raidz_expansion_thread(spa);
spa_start_indirect_condensing_thread(spa);
spa_start_livelist_destroy_thread(spa);
spa_start_livelist_condensing_thread(spa);
@@ -2931,12 +3360,13 @@ spa_try_repair(spa_t *spa, nvlist_t *config)
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
{
- char *ereport = FM_EREPORT_ZFS_POOL;
+ const char *ereport = FM_EREPORT_ZFS_POOL;
int error;
spa->spa_load_state = state;
(void) spa_import_progress_set_state(spa_guid(spa),
spa_load_state(spa));
+ spa_import_progress_set_notes(spa, "spa_load()");
gethrestime(&spa->spa_loaded_ts);
error = spa_load_impl(spa, type, &ereport);
@@ -2978,6 +3408,12 @@ vdev_count_verify_zaps(vdev_t *vd)
spa_t *spa = vd->vdev_spa;
uint64_t total = 0;
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) &&
+ vd->vdev_root_zap != 0) {
+ total++;
+ ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, vd->vdev_root_zap));
+ }
if (vd->vdev_top_zap != 0) {
total++;
ASSERT0(zap_lookup_int(spa->spa_meta_objset,
@@ -2995,6 +3431,8 @@ vdev_count_verify_zaps(vdev_t *vd)
return (total);
}
+#else
+#define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
#endif
/*
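
The non-debug stub above relies on the `((void) sizeof (vd), 0)` idiom: sizeof only inspects its operand's type, so the argument counts as referenced without ever being evaluated, and the comma operator yields 0. A self-contained illustration, where MY_DEBUG and count_things() are invented names:

#include <stdio.h>

#ifdef MY_DEBUG
static int
count_things(const int *array, int n)
{
	int total = 0;
	for (int i = 0; i < n; i++)
		total += array[i];
	return (total);
}
#else
/*
 * sizeof only inspects the operand's type, so 'array' is referenced
 * (silencing unused-variable warnings) but never evaluated; the comma
 * operator then yields the constant 0.
 */
#define	count_things(array, n)	((void) sizeof (array), 0)
#endif

int
main(void)
{
	int samples[3] = { 1, 2, 3 };

	/* Prints 6 when built with -DMY_DEBUG, 0 otherwise. */
	printf("total = %d\n", count_things(samples, 3));
	return (0);
}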
@@ -3146,18 +3584,23 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
- * Perform the import activity check. If the user canceled the import or
- * we detected activity then fail.
+ * Remote host activity check.
+ *
+ * error results:
+ * 0 - no activity detected
+ * EREMOTEIO - remote activity detected
+ * EINTR - user canceled the operation
*/
static int
-spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
+ boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
uint64_t mmp_config = ub->ub_mmp_config;
uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
uint64_t import_delay;
- hrtime_t import_expire;
+ hrtime_t import_expire, now;
nvlist_t *mmp_label = NULL;
vdev_t *rvd = spa->spa_root_vdev;
kcondvar_t cv;
@@ -3195,9 +3638,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
- while (gethrtime() < import_expire) {
- (void) spa_import_progress_set_mmp_check(spa_guid(spa),
- NSEC2SEC(import_expire - gethrtime()));
+ if (importing) {
+ spa_import_progress_set_notes(spa, "Checking MMP activity, "
+ "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+ }
+
+ int iterations = 0;
+ while ((now = gethrtime()) < import_expire) {
+ if (importing && iterations++ % 30 == 0) {
+ spa_import_progress_set_notes(spa, "Checking MMP "
+ "activity, %llu ms remaining",
+ (u_longlong_t)NSEC2MSEC(import_expire - now));
+ }
+
+ if (importing) {
+ (void) spa_import_progress_set_mmp_check(spa_guid(spa),
+ NSEC2SEC(import_expire - gethrtime()));
+ }
vdev_uberblock_load(rvd, ub, &mmp_label);
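
The reworked MMP wait loop above reports the remaining delay only on every 30th pass instead of on each iteration. A rough userland model of that throttled progress reporting, using CLOCK_MONOTONIC and invented names (now_ns(), wait_ns):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define	NSEC_PER_MSEC	1000000ULL
#define	NSEC_PER_SEC	1000000000ULL

static uint64_t
now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * NSEC_PER_SEC + (uint64_t)ts.tv_nsec);
}

int
main(void)
{
	uint64_t wait_ns = 2 * NSEC_PER_SEC;	/* stand-in for import_delay */
	uint64_t expire = now_ns() + wait_ns;
	uint64_t now;
	int iterations = 0;

	printf("checking activity, waiting %llu ms\n",
	    (unsigned long long)(wait_ns / NSEC_PER_MSEC));

	while ((now = now_ns()) < expire) {
		/* Throttle the progress note to every 30th pass. */
		if (iterations++ % 30 == 0) {
			printf("%llu ms remaining\n",
			    (unsigned long long)((expire - now) /
			    NSEC_PER_MSEC));
		}
		/* ... poll for remote activity here ... */
		struct timespec delay = { 0, 10 * 1000 * 1000 };
		nanosleep(&delay, NULL);	/* 10 ms between polls */
	}
	return (0);
}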
@@ -3246,7 +3703,7 @@ out:
* ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
*/
if (error == EREMOTEIO) {
- char *hostname = "<unknown>";
+ const char *hostname = "<unknown>";
uint64_t hostid = 0;
if (mmp_label) {
@@ -3279,11 +3736,66 @@ out:
return (error);
}
+/*
+ * Called from zfs_ioc_clear for a pool that was suspended
+ * after failing mmp write checks.
+ */
+boolean_t
+spa_mmp_remote_host_activity(spa_t *spa)
+{
+ ASSERT(spa_multihost(spa) && spa_suspended(spa));
+
+ nvlist_t *best_label;
+ uberblock_t best_ub;
+
+ /*
+ * Locate the best uberblock on disk
+ */
+ vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
+ if (best_label) {
+ /*
+ * Confirm that the best label's hostid matches our hostid
+ */
+ if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
+ spa_get_hostid(spa) !=
+ fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
+ nvlist_free(best_label);
+ return (B_TRUE);
+ }
+ nvlist_free(best_label);
+ } else {
+ return (B_TRUE);
+ }
+
+ if (!MMP_VALID(&best_ub) ||
+ !MMP_FAIL_INT_VALID(&best_ub) ||
+ MMP_FAIL_INT(&best_ub) == 0) {
+ return (B_TRUE);
+ }
+
+ if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
+ best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
+ zfs_dbgmsg("txg mismatch detected during pool clear "
+ "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
+ (u_longlong_t)spa->spa_uberblock.ub_txg,
+ (u_longlong_t)best_ub.ub_txg,
+ (u_longlong_t)spa->spa_uberblock.ub_timestamp,
+ (u_longlong_t)best_ub.ub_timestamp);
+ return (B_TRUE);
+ }
+
+ /*
+ * Perform an activity check looking for any remote writer
+ */
+ return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
+ B_FALSE) != 0);
+}
+
static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
uint64_t hostid;
- char *hostname;
+ const char *hostname;
uint64_t myhostid = 0;
if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
@@ -3318,8 +3830,8 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
int parse;
vdev_t *rvd;
uint64_t pool_guid;
- char *comment;
- char *compatibility;
+ const char *comment;
+ const char *compatibility;
/*
* Versioning wasn't explicitly added to the label until later, so if
@@ -3581,6 +4093,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
}
spa_load_note(spa, "using uberblock with txg=%llu",
(u_longlong_t)ub->ub_txg);
+ if (ub->ub_raidz_reflow_info != 0) {
+ spa_load_note(spa, "uberblock raidz_reflow_info: "
+ "state=%u offset=%llu",
+ (int)RRSS_GET_STATE(ub),
+ (u_longlong_t)RRSS_GET_OFFSET(ub));
+ }
/*
@@ -3599,7 +4117,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
- int error = spa_activity_check(spa, ub, spa->spa_config);
+ int error =
+ spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
@@ -3806,6 +4325,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
spa_config_exit(spa, SCL_ALL, FTAG);
/*
+ * If 'zpool import' used a cached config, then the on-disk hostid and
+ * hostname may differ from the cached config in ways that should
+ * prevent import. Userspace can't discover this without a scan, but
+ * we know, so we add these values to LOAD_INFO so the caller can know
+ * the difference.
+ *
+ * Note that we have to do this before the config is regenerated,
+ * because the new config will have the hostid and hostname for this
+ * host, in readiness for import.
+ */
+ if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
+ fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
+ fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
+ if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
+ fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
+ fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
+
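
On the consumer side, the hostid/hostname carried in the load-info nvlist can be read back with the same fnvlist helpers the hunk uses. A small sketch, assuming the userland libnvpair library (link with -lnvpair) and using literal key strings in place of the ZPOOL_CONFIG_* macros:

#include <libnvpair.h>
#include <stdio.h>

int
main(void)
{
	/* Stand-in for the spa_load_info nvlist populated in the hunk. */
	nvlist_t *load_info = fnvlist_alloc();
	fnvlist_add_uint64(load_info, "hostid", 0xdeadbeefULL);
	fnvlist_add_string(load_info, "hostname", "otherhost");

	/* A caller can inspect the keys before deciding to import. */
	if (nvlist_exists(load_info, "hostid") &&
	    nvlist_exists(load_info, "hostname")) {
		printf("pool last written by %s (hostid 0x%llx)\n",
		    fnvlist_lookup_string(load_info, "hostname"),
		    (unsigned long long)fnvlist_lookup_uint64(load_info,
		    "hostid"));
	}

	fnvlist_free(load_info);
	return (0);
}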
+ /*
* We will use spa_config if we decide to reload the spa or if spa_load
* fails and we rewind. We must thus regenerate the config using the
* MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
@@ -4163,6 +4700,7 @@ spa_ld_get_props(spa_t *spa)
spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
} else if (error != 0) {
+ nvlist_free(mos_config);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
/*
@@ -4323,7 +4861,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
error = spa_ld_log_spacemaps(spa);
if (error != 0) {
- spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
+ spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
error);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
}
@@ -4354,7 +4892,22 @@ spa_ld_load_dedup_tables(spa_t *spa)
}
static int
-spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
+spa_ld_load_brt(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = brt_load(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "brt_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
{
vdev_t *rvd = spa->spa_root_vdev;
@@ -4481,7 +5034,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa)
int error = 0;
ASSERT0(spa->spa_checkpoint_txg);
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_load_thread == curthread);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
@@ -4721,13 +5275,14 @@ spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
* config stored in the MOS.
*/
static int
-spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
{
int error = 0;
boolean_t missing_feat_write = B_FALSE;
boolean_t checkpoint_rewind =
(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
boolean_t update_config_cache = B_FALSE;
+ hrtime_t load_start = gethrtime();
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
@@ -4773,11 +5328,18 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
}
/*
+ * Drop the namespace lock for the rest of the function.
+ */
+ spa->spa_load_thread = curthread;
+ mutex_exit(&spa_namespace_lock);
+
+ /*
* Retrieve the checkpoint txg if the pool has a checkpoint.
*/
+ spa_import_progress_set_notes(spa, "Loading checkpoint txg");
error = spa_ld_read_checkpoint_txg(spa);
if (error != 0)
- return (error);
+ goto fail;
/*
* Retrieve the mapping of indirect vdevs. Those vdevs were removed
@@ -4787,60 +5349,73 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* initiated. Otherwise we could be reading from indirect vdevs before
* we have loaded their mappings.
*/
+ spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
error = spa_ld_open_indirect_vdev_metadata(spa);
if (error != 0)
- return (error);
+ goto fail;
/*
* Retrieve the full list of active features from the MOS and check if
* they are all supported.
*/
+ spa_import_progress_set_notes(spa, "Checking feature flags");
error = spa_ld_check_features(spa, &missing_feat_write);
if (error != 0)
- return (error);
+ goto fail;
/*
* Load several special directories from the MOS needed by the dsl_pool
* layer.
*/
+ spa_import_progress_set_notes(spa, "Loading special MOS directories");
error = spa_ld_load_special_directories(spa);
if (error != 0)
- return (error);
+ goto fail;
/*
* Retrieve pool properties from the MOS.
*/
+ spa_import_progress_set_notes(spa, "Loading properties");
error = spa_ld_get_props(spa);
if (error != 0)
- return (error);
+ goto fail;
/*
* Retrieve the list of auxiliary devices - cache devices and spares -
* and open them.
*/
+ spa_import_progress_set_notes(spa, "Loading AUX vdevs");
error = spa_ld_open_aux_vdevs(spa, type);
if (error != 0)
- return (error);
+ goto fail;
/*
* Load the metadata for all vdevs. Also check if unopenable devices
* should be autoreplaced.
*/
+ spa_import_progress_set_notes(spa, "Loading vdev metadata");
error = spa_ld_load_vdev_metadata(spa);
if (error != 0)
- return (error);
+ goto fail;
+ spa_import_progress_set_notes(spa, "Loading dedup tables");
error = spa_ld_load_dedup_tables(spa);
if (error != 0)
- return (error);
+ goto fail;
+
+ spa_import_progress_set_notes(spa, "Loading BRT");
+ error = spa_ld_load_brt(spa);
+ if (error != 0)
+ goto fail;
/*
* Verify the logs now to make sure we don't have any unexpected errors
* when we claim log blocks later.
*/
+ spa_import_progress_set_notes(spa, "Verifying Log Devices");
error = spa_ld_verify_logs(spa, type, ereport);
if (error != 0)
- return (error);
+ goto fail;
if (missing_feat_write) {
ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
@@ -4850,8 +5425,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* read-only mode but not read-write mode. We now have enough
* information and can return to userland.
*/
- return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
- ENOTSUP));
+ error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP);
+ goto fail;
}
/*
@@ -4859,15 +5435,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* state. When performing an extreme rewind, we verify the whole pool,
* which can take a very long time.
*/
+ spa_import_progress_set_notes(spa, "Verifying pool data");
error = spa_ld_verify_pool_data(spa);
if (error != 0)
- return (error);
+ goto fail;
/*
* Calculate the deflated space for the pool. This must be done before
* we write anything to the pool because we'd need to update the space
* accounting using the deflated sizes.
*/
+ spa_import_progress_set_notes(spa, "Calculating deflated space");
spa_update_dspace(spa);
/*
@@ -4875,6 +5453,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* pool. If we are importing the pool in read-write mode, a few
* additional steps must be performed to finish the import.
*/
+ spa_import_progress_set_notes(spa, "Starting import");
if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
spa->spa_load_max_txg == UINT64_MAX)) {
uint64_t config_cache_txg = spa->spa_config_txg;
@@ -4882,6 +5461,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
/*
+ * Before we do any zio_write's, complete the raidz expansion
+ * scratch space copying, if necessary.
+ */
+ if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
+ vdev_raidz_reflow_copy_scratch(spa);
+
+ /*
* In case of a checkpoint rewind, log the original txg
* of the checkpointed uberblock.
*/
@@ -4891,6 +5477,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
(u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
}
+ spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
/*
* Traverse the ZIL and claim all blocks.
*/
@@ -4910,6 +5497,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* will have been set for us by ZIL traversal operations
* performed above.
*/
+ spa_import_progress_set_notes(spa, "Syncing ZIL claims");
txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
/*
@@ -4917,6 +5505,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* next sync, we would update the config stored in vdev labels
* and the cachefile (by default /etc/zfs/zpool.cache).
*/
+ spa_import_progress_set_notes(spa, "Updating configs");
spa_ld_check_for_config_update(spa, config_cache_txg,
update_config_cache);
@@ -4925,6 +5514,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* Then check all DTLs to see if anything needs resilvering.
* The resilver will be deferred if a rebuild was started.
*/
+ spa_import_progress_set_notes(spa, "Starting resilvers");
if (vdev_rebuild_active(spa->spa_root_vdev)) {
vdev_rebuild_restart(spa);
} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
@@ -4938,6 +5528,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
*/
spa_history_log_version(spa, "open", NULL);
+ spa_import_progress_set_notes(spa,
+ "Restarting device removals");
spa_restart_removal(spa);
spa_spawn_aux_threads(spa);
@@ -4950,27 +5542,40 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* auxiliary threads above (from which the livelist
* deletion zthr is part of).
*/
+ spa_import_progress_set_notes(spa,
+ "Cleaning up inconsistent objsets");
(void) dmu_objset_find(spa_name(spa),
dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
/*
* Clean up any stale temporary dataset userrefs.
*/
+ spa_import_progress_set_notes(spa,
+ "Cleaning up temporary userrefs");
dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_import_progress_set_notes(spa, "Restarting initialize");
vdev_initialize_restart(spa->spa_root_vdev);
+ spa_import_progress_set_notes(spa, "Restarting TRIM");
vdev_trim_restart(spa->spa_root_vdev);
vdev_autotrim_restart(spa);
spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_import_progress_set_notes(spa, "Finished importing");
}
+ zio_handle_import_delay(spa, gethrtime() - load_start);
spa_import_progress_remove(spa_guid(spa));
spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
spa_load_note(spa, "LOADED");
+fail:
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_load_thread = NULL;
+ cv_broadcast(&spa_namespace_cv);
+
+ return (error);
- return (0);
}
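
spa_load_impl() now records the loading thread, drops spa_namespace_lock for the long-running phases, and routes every outcome through the fail: label, which retakes the lock, clears the marker, and broadcasts spa_namespace_cv so waiting lookups can proceed. A compact pthread sketch of that control flow, with invented names (ns_lock, load_impl(), step()):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ns_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ns_cv = PTHREAD_COND_INITIALIZER;
static pthread_t load_thread;		/* who is loading, if anyone */
static int load_in_progress;

/* Stand-in for one load phase; prints a progress note, returns a result. */
static int
step(const char *note, int rc)
{
	printf("%s\n", note);
	return (rc);
}

/* Called with ns_lock held; returns with ns_lock held again. */
static int
load_impl(void)
{
	int error;

	/* Record ourselves, then drop the lock for the slow work. */
	load_thread = pthread_self();
	load_in_progress = 1;
	pthread_mutex_unlock(&ns_lock);

	if ((error = step("loading metadata", 0)) != 0)
		goto fail;
	if ((error = step("verifying data", 0)) != 0)
		goto fail;
	/* Success also falls through to the common exit. */

fail:
	pthread_mutex_lock(&ns_lock);
	load_in_progress = 0;	/* waiters test this under ns_lock */
	pthread_cond_broadcast(&ns_cv);
	return (error);
}

int
main(void)
{
	pthread_mutex_lock(&ns_lock);
	int error = load_impl();
	pthread_mutex_unlock(&ns_lock);
	printf("load_impl() returned %d\n", error);
	return (0);
}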
static int
@@ -5112,8 +5717,8 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
* ambiguous state.
*/
static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
- nvlist_t **config)
+spa_open_common(const char *pool, spa_t **spapp, const void *tag,
+ nvlist_t *nvpolicy, nvlist_t **config)
{
spa_t *spa;
spa_load_state_t state = SPA_LOAD_OPEN;
@@ -5170,7 +5775,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
*/
spa_unload(spa);
spa_deactivate(spa);
- spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
spa_remove(spa);
if (locked)
mutex_exit(&spa_namespace_lock);
@@ -5208,7 +5813,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
* If we've recovered the pool, pass back any information we
* gathered while doing the load.
*/
- if (state == SPA_LOAD_RECOVER) {
+ if (state == SPA_LOAD_RECOVER && config != NULL) {
fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
spa->spa_load_info);
}
@@ -5229,14 +5834,14 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
}
int
-spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
- nvlist_t **config)
+spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
+ nvlist_t *policy, nvlist_t **config)
{
return (spa_open_common(name, spapp, tag, policy, config));
}
int
-spa_open(const char *name, spa_t **spapp, void *tag)
+spa_open(const char *name, spa_t **spapp, const void *tag)
{
return (spa_open_common(name, spapp, tag, NULL, NULL));
}
@@ -5292,8 +5897,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, &spares, &nspares));
if (nspares != 0) {
- fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares,
- nspares);
+ fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ (const nvlist_t * const *)spares, nspares);
VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares));
@@ -5305,13 +5910,15 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
for (i = 0; i < nspares; i++) {
guid = fnvlist_lookup_uint64(spares[i],
ZPOOL_CONFIG_GUID);
+ VERIFY0(nvlist_lookup_uint64_array(spares[i],
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
if (spa_spare_exists(guid, &pool, NULL) &&
pool != 0ULL) {
- VERIFY0(nvlist_lookup_uint64_array(spares[i],
- ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs,
- &vsc));
vs->vs_state = VDEV_STATE_CANT_OPEN;
vs->vs_aux = VDEV_AUX_SPARED;
+ } else {
+ vs->vs_state =
+ spa->spa_spares.sav_vdevs[i]->vdev_state;
}
}
}
@@ -5340,8 +5947,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
if (nl2cache != 0) {
- fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache,
- nl2cache);
+ fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ (const nvlist_t * const *)l2cache, nl2cache);
VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache));
@@ -5484,7 +6091,7 @@ spa_get_stats(const char *name, nvlist_t **config,
fnvlist_add_uint64(*config,
ZPOOL_CONFIG_ERRCOUNT,
- spa_get_errlog_size(spa));
+ spa_approx_errlog_size(spa));
if (spa_suspended(spa)) {
fnvlist_add_uint64(*config,
@@ -5648,8 +6255,8 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
fnvlist_remove(sav->sav_config, config);
- fnvlist_add_nvlist_array(sav->sav_config, config, newdevs,
- ndevs + oldndevs);
+ fnvlist_add_nvlist_array(sav->sav_config, config,
+ (const nvlist_t * const *)newdevs, ndevs + oldndevs);
for (i = 0; i < oldndevs + ndevs; i++)
nvlist_free(newdevs[i]);
kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
@@ -5658,7 +6265,8 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
* Generate a new dev list.
*/
sav->sav_config = fnvlist_alloc();
- fnvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs);
+ fnvlist_add_nvlist_array(sav->sav_config, config,
+ (const nvlist_t * const *)devs, ndevs);
}
}
@@ -5708,7 +6316,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
nvlist_t *zplprops, dsl_crypto_params_t *dcp)
{
spa_t *spa;
- char *altroot = NULL;
+ const char *altroot = NULL;
vdev_t *rvd;
dsl_pool_t *dp;
dmu_tx_t *tx;
@@ -5721,12 +6329,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
boolean_t has_encryption;
boolean_t has_allocclass;
spa_feature_t feat;
- char *feat_name;
- char *poolname;
+ const char *feat_name;
+ const char *poolname;
nvlist_t *nvl;
if (props == NULL ||
- nvlist_lookup_string(props, "tname", &poolname) != 0)
+ nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
poolname = (char *)pool;
/*
@@ -5869,7 +6478,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
&spares, &nspares) == 0) {
spa->spa_spares.sav_config = fnvlist_alloc();
fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, nspares);
+ ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
+ nspares);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_spares(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -5881,9 +6491,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache) == 0) {
- spa->spa_l2cache.sav_config = fnvlist_alloc();
+ VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP));
fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache);
+ ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
+ nl2cache);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_l2cache(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -5898,6 +6510,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
* Create DDTs (dedup tables).
*/
ddt_create(spa);
+ /*
+ * Create BRT table and BRT table object.
+ */
+ brt_create(spa);
spa_update_dspace(spa);
@@ -5990,7 +6606,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa_spawn_aux_threads(spa);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
/*
* Don't count references from objsets that are already closed
@@ -6000,6 +6616,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
spa->spa_load_state = SPA_LOAD_NONE;
+ spa_import_os(spa);
+
mutex_exit(&spa_namespace_lock);
return (0);
@@ -6012,7 +6630,7 @@ int
spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
{
spa_t *spa;
- char *altroot = NULL;
+ const char *altroot = NULL;
spa_load_state_t state = SPA_LOAD_IMPORT;
zpool_load_policy_t policy;
spa_mode_t mode = spa_mode_global;
@@ -6051,7 +6669,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
if (props != NULL)
spa_configfile_set(spa, props, B_FALSE);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
zfs_dbgmsg("spa_import: verbatim import of %s", pool);
mutex_exit(&spa_namespace_lock);
@@ -6131,7 +6749,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
else
spa->spa_spares.sav_config = fnvlist_alloc();
fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, nspares);
+ ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
+ nspares);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_spares(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -6145,7 +6764,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
else
spa->spa_l2cache.sav_config = fnvlist_alloc();
fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
- ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache);
+ ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
+ nl2cache);
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa_load_l2cache(spa);
spa_config_exit(spa, SCL_ALL, FTAG);
@@ -6181,6 +6801,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
zvol_create_minors_recursive(pool);
+ spa_import_os(spa);
+
return (0);
}
@@ -6188,7 +6810,7 @@ nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
nvlist_t *config = NULL;
- char *poolname, *cachefile;
+ const char *poolname, *cachefile;
spa_t *spa;
uint64_t state;
int error;
@@ -6203,9 +6825,14 @@ spa_tryimport(nvlist_t *tryconfig)
/*
* Create and initialize the spa structure.
*/
+ char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
+ TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);
+
mutex_enter(&spa_namespace_lock);
- spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
+ spa = spa_add(name, tryconfig, NULL);
spa_activate(spa, SPA_MODE_READ);
+ kmem_free(name, MAXPATHLEN);
/*
* Rewind pool if a max txg was provided.
@@ -6228,6 +6855,16 @@ spa_tryimport(nvlist_t *tryconfig)
spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
}
+ /*
+ * spa_import() relies on a pool config fetched by spa_try_import()
+ * spa_tryimport(), which makes it return early on a missing log
+ * device and consequently fail to retrieve the cache and spare devices.
+ * device and missing retrieving the cache device and spare eventually.
+ * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch
+ * the correct configuration regardless of the missing log device.
+ */
+ spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
+
error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
/*
@@ -6308,8 +6945,9 @@ static int
spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force, boolean_t hardforce)
{
- int error;
+ int error = 0;
spa_t *spa;
+ hrtime_t export_start = gethrtime();
if (oldconfig)
*oldconfig = NULL;
@@ -6331,8 +6969,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa->spa_is_exporting = B_TRUE;
/*
- * Put a hold on the pool, drop the namespace lock, stop async tasks,
- * reacquire the namespace lock, and see if we can export.
+ * Put a hold on the pool, drop the namespace lock, stop async tasks
+ * and see if we can export.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
@@ -6342,10 +6980,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
+ spa->spa_export_thread = curthread;
spa_close(spa, FTAG);
- if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ mutex_exit(&spa_namespace_lock);
goto export_spa;
+ }
+
/*
* The pool will be in core if it's openable, in which case we can
* modify its state. Objsets may be open only because they're dirty,
@@ -6366,7 +7008,16 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
goto fail;
}
+ mutex_exit(&spa_namespace_lock);
+ /*
+ * At this point we no longer hold the spa_namespace_lock and
+ * there are no references on the spa. Future spa_lookup() calls will
+ * notice the spa->spa_export_thread and wait until we signal
+ * that we are finished.
+ */
+
if (spa->spa_sync_on) {
+ vdev_t *rvd = spa->spa_root_vdev;
/*
* A pool cannot be exported if it has an active shared spare.
* This is to prevent other pools stealing the active spare
@@ -6376,6 +7027,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (!force && new_state == POOL_STATE_EXPORTED &&
spa_has_active_shared_spare(spa)) {
error = SET_ERROR(EXDEV);
+ mutex_enter(&spa_namespace_lock);
goto fail;
}
@@ -6386,13 +7038,10 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
* dirty data resulting from the initialization is
* committed to disk before we unload the pool.
*/
- if (spa->spa_root_vdev != NULL) {
- vdev_t *rvd = spa->spa_root_vdev;
- vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
- vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
- vdev_autotrim_stop_all(spa);
- vdev_rebuild_stop_all(spa);
- }
+ vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
/*
* We want this to be reflected on every label,
@@ -6402,14 +7051,34 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_state = new_state;
+ vdev_config_dirty(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ }
+
+ /*
+ * If the log space map feature is enabled and the pool is
+ * getting exported (but not destroyed), we want to spend some
+ * time flushing as many metaslabs as we can in an attempt to
+ * destroy log space maps and save import time. This has to be
+ * done before we set the spa_final_txg, otherwise
+ * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
+ * spa_should_flush_logs_on_unload() should be called after
+ * spa_state has been set to the new_state.
+ */
+ if (spa_should_flush_logs_on_unload(spa))
+ spa_unload_log_sm_flush_all(spa);
+
+ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
spa->spa_final_txg = spa_last_synced_txg(spa) +
TXG_DEFER_SIZE + 1;
- vdev_config_dirty(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG);
}
}
export_spa:
+ spa_export_os(spa);
+
if (new_state == POOL_STATE_DESTROYED)
spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
else if (new_state == POOL_STATE_EXPORTED)
@@ -6423,9 +7092,16 @@ export_spa:
if (oldconfig && spa->spa_config)
*oldconfig = fnvlist_dup(spa->spa_config);
+ if (new_state == POOL_STATE_EXPORTED)
+ zio_handle_export_delay(spa, gethrtime() - export_start);
+
+ /*
+ * Take the namespace lock for the actual spa_t removal
+ */
+ mutex_enter(&spa_namespace_lock);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!hardforce)
- spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
spa_remove(spa);
} else {
/*
@@ -6434,14 +7110,25 @@ export_spa:
* we make sure to reset the exporting flag.
*/
spa->spa_is_exporting = B_FALSE;
+ spa->spa_export_thread = NULL;
}
+ /*
+ * Wake up any waiters in spa_lookup()
+ */
+ cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (0);
fail:
spa->spa_is_exporting = B_FALSE;
+ spa->spa_export_thread = NULL;
+
spa_async_resume(spa);
+ /*
+ * Wake up any waiters in spa_lookup()
+ */
+ cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (error);
}
@@ -6501,7 +7188,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
* Add a device to a storage pool.
*/
int
-spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
{
uint64_t txg, ndraid = 0;
int error;
@@ -6592,6 +7279,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
}
+ if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ if (tvd->vdev_ashift != spa->spa_max_ashift) {
+ return (spa_vdev_exit(spa, vd, txg,
+ ZFS_ERR_ASHIFT_MISMATCH));
+ }
+ }
+ }
+
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
@@ -6651,9 +7348,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
/*
- * Attach a device to a mirror. The arguments are the path to any device
- * in the mirror, and the nvroot for the new device. If the path specifies
- * a device that is not mirrored, we automatically insert the mirror vdev.
+ * Attach a device to a vdev specified by its guid. The vdev type can be
+ * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
+ * single device). When the vdev is a single device, a mirror vdev will be
+ * automatically inserted.
*
* If 'replacing' is specified, the new device is intended to replace the
* existing device; in this case the two devices are made into their own
@@ -6676,7 +7374,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
- int newvd_isspare;
+ int newvd_isspare = B_FALSE;
int error;
ASSERT(spa_writeable(spa));
@@ -6696,28 +7394,49 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
- if (dsl_scan_resilvering(spa_get_dsl(spa)))
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
+ dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_RESILVER_IN_PROGRESS));
+ }
} else {
if (vdev_rebuild_active(rvd))
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_REBUILD_IN_PROGRESS));
}
- if (spa->spa_vdev_removal != NULL)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ if (spa->spa_vdev_removal != NULL) {
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_DEVRM_IN_PROGRESS));
+ }
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
- if (!oldvd->vdev_ops->vdev_op_leaf)
+ boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
+
+ if (raidz) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * Can't expand a raidz while prior expand is in progress.
+ */
+ if (spa->spa_raidz_expand != NULL) {
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
+ }
+ } else if (!oldvd->vdev_ops->vdev_op_leaf) {
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
- pvd = oldvd->vdev_parent;
+ if (raidz)
+ pvd = oldvd;
+ else
+ pvd = oldvd->vdev_parent;
- if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
- VDEV_ALLOC_ATTACH)) != 0)
+ if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
+ VDEV_ALLOC_ATTACH) != 0)
return (spa_vdev_exit(spa, NULL, txg, EINVAL));
if (newrootvd->vdev_children != 1)
@@ -6732,10 +7451,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
return (spa_vdev_exit(spa, newrootvd, txg, error));
/*
- * Spares can't replace logs
+ * Log, dedup and special vdevs should not be replaced by spares.
*/
- if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
+ if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
+ oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
/*
* A dRAID spare can only replace a child of its parent dRAID vdev.
@@ -6764,11 +7485,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (!replacing) {
/*
- * For attach, the only allowable parent is a mirror or the root
- * vdev.
+ * For attach, the only allowable parent is a mirror or
+ * the root vdev. A raidz vdev can be attached to, but
+ * you cannot attach to a raidz child.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops)
+ pvd->vdev_ops != &vdev_root_ops &&
+ !raidz)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
pvops = &vdev_mirror_ops;
@@ -6807,7 +7530,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
/*
* Make sure the new device is big enough.
*/
- if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
+ if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
@@ -6818,31 +7542,74 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
+ * RAIDZ-expansion-specific checks.
+ */
+ if (raidz) {
+ if (vdev_raidz_attach_check(newvd) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * Fail early if a child is not healthy or being replaced
+ */
+ for (int i = 0; i < oldvd->vdev_children; i++) {
+ if (vdev_is_dead(oldvd->vdev_child[i]) ||
+ !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, newrootvd, txg,
+ ENXIO));
+ }
+ /* Also fail if reserved boot area is in-use */
+ if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
+ != 0) {
+ return (spa_vdev_exit(spa, newrootvd, txg,
+ EADDRINUSE));
+ }
+ }
+ }
+
+ if (raidz) {
+ /*
+ * Note: oldvdpath is freed by spa_strfree(), but
+ * kmem_asprintf() is freed by kmem_strfree(), so we have to
+ * move it to a spa_strdup-ed string.
+ */
+ char *tmp = kmem_asprintf("raidz%u-%u",
+ (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
+ oldvdpath = spa_strdup(tmp);
+ kmem_strfree(tmp);
+ } else {
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ }
+ newvdpath = spa_strdup(newvd->vdev_path);
+
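
The comment above is about allocator ownership: kmem_asprintf() output must be released with kmem_strfree(), while vdev path strings travel through spa_strdup()/spa_strfree(), so the formatted name is copied and the temporary freed. A userland sketch of the same hand-off, with xstrdup()/xstrfree() as invented stand-ins (asprintf() needs _GNU_SOURCE on glibc):

#define _GNU_SOURCE	/* for asprintf() on glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Invented subsystem that requires its strings to come from xstrdup()
 * and go back through xstrfree().
 */
static char *
xstrdup(const char *s)
{
	char *copy = strdup(s);

	if (copy == NULL)
		abort();
	return (copy);
}

static void
xstrfree(char *s)
{
	free(s);
}

int
main(void)
{
	unsigned int nparity = 2, id = 0;
	char *tmp;

	/* asprintf() hands back a malloc()ed buffer ... */
	if (asprintf(&tmp, "raidz%u-%u", nparity, id) < 0)
		abort();

	/* ... so copy it into the subsystem's allocator and drop the temp. */
	char *name = xstrdup(tmp);
	free(tmp);

	printf("%s\n", name);
	xstrfree(name);
	return (0);
}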
+ /*
* If this is an in-place replacement, update oldvd's path and devid
* to make it distinguishable from newvd, and unopenable from now on.
*/
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ if (strcmp(oldvdpath, newvdpath) == 0) {
spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
KM_SLEEP);
- (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
- "%s/%s", newvd->vdev_path, "old");
+ (void) sprintf(oldvd->vdev_path, "%s/old",
+ newvdpath);
if (oldvd->vdev_devid != NULL) {
spa_strfree(oldvd->vdev_devid);
oldvd->vdev_devid = NULL;
}
+ spa_strfree(oldvdpath);
+ oldvdpath = spa_strdup(oldvd->vdev_path);
}
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
*/
- if (pvd->vdev_ops != pvops)
+ if (!raidz && pvd->vdev_ops != pvops) {
pvd = vdev_add_parent(oldvd, pvops);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+ }
ASSERT(pvd->vdev_top->vdev_parent == rvd);
- ASSERT(pvd->vdev_ops == pvops);
- ASSERT(oldvd->vdev_parent == pvd);
/*
* Extract the new device from its root and add it to pvd.
@@ -6870,41 +7637,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING,
- TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
+ if (raidz) {
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
- if (newvd->vdev_isspare) {
- spa_spare_activate(newvd);
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
- }
+ vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_wait(tvd);
- oldvdpath = spa_strdup(oldvd->vdev_path);
- newvdpath = spa_strdup(newvd->vdev_path);
- newvd_isspare = newvd->vdev_isspare;
+ dtl_max_txg = spa_vdev_config_enter(spa);
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
+ tvd->vdev_rz_expanding = B_TRUE;
- /*
- * Schedule the resilver or rebuild to restart in the future. We do
- * this to ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
- */
- if (rebuild) {
- newvd->vdev_rebuild_txg = txg;
+ vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
+ vdev_config_dirty(tvd);
- vdev_rebuild(tvd);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
+ dtl_max_txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
+ newvd, tx);
+ dmu_tx_commit(tx);
} else {
- newvd->vdev_resilver_txg = txg;
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
+
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
+
+ newvd_isspare = newvd->vdev_isspare;
- if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
- spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
- vdev_defer_resilver(newvd);
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ /*
+ * Schedule the resilver or rebuild to restart in the future.
+ * We do this to ensure that dmu_sync-ed blocks have been
+ * stitched into the respective datasets.
+ */
+ if (rebuild) {
+ newvd->vdev_rebuild_txg = txg;
+
+ vdev_rebuild(tvd);
} else {
- dsl_scan_restart_resilver(spa->spa_dsl_pool,
- dtl_max_txg);
+ newvd->vdev_resilver_txg = txg;
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa,
+ SPA_FEATURE_RESILVER_DEFER)) {
+ vdev_defer_resilver(newvd);
+ } else {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool,
+ dtl_max_txg);
+ }
}
}
@@ -6934,7 +7726,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
* Detach a device from a mirror or replacing vdev.
*
* If 'replace_done' is specified, only detach if the parent
- * is a replacing vdev.
+ * is a replacing or a spare vdev.
*/
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
@@ -7073,7 +7865,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
* it may be that the unwritability of the disk is the reason
* it's being detached!
*/
- error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
/*
* Remove vd from its parent and compact the parent's children.
@@ -7229,7 +8021,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
*/
if (cmd_type == POOL_INITIALIZE_START &&
(vd->vdev_initialize_thread != NULL ||
- vd->vdev_top->vdev_removing)) {
+ vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
mutex_exit(&vd->vdev_initialize_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
@@ -7241,6 +8033,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
mutex_exit(&vd->vdev_initialize_lock);
return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_UNINIT &&
+ vd->vdev_initialize_thread != NULL) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(EBUSY));
}
switch (cmd_type) {
@@ -7253,6 +8049,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
case POOL_INITIALIZE_SUSPEND:
vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
break;
+ case POOL_INITIALIZE_UNINIT:
+ vdev_uninitialize(vd);
+ break;
default:
panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
}
@@ -7344,7 +8143,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
* which has completed but the thread is not exited.
*/
if (cmd_type == POOL_TRIM_START &&
- (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+ (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
+ vd->vdev_top->vdev_rz_expanding)) {
mutex_exit(&vd->vdev_trim_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_TRIM_CANCEL &&
@@ -7432,7 +8232,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
* Split a set of devices from their mirrors, and create a new pool from them.
*/
int
-spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
nvlist_t *props, boolean_t exp)
{
int error = 0;
@@ -7441,7 +8241,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
uint_t c, children, lastlog;
nvlist_t **child, *nvl, *tmp;
dmu_tx_t *tx;
- char *altroot = NULL;
+ const char *altroot = NULL;
vdev_t *rvd, **vml = NULL; /* vdev modify list */
boolean_t activate_slog;
@@ -7986,6 +8786,7 @@ spa_scan_stop(spa_t *spa)
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
if (dsl_scan_resilvering(spa->spa_dsl_pool))
return (SET_ERROR(EBUSY));
+
return (dsl_scan_cancel(spa->spa_dsl_pool));
}
@@ -8011,6 +8812,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
return (0);
}
+ if (func == POOL_SCAN_ERRORSCRUB &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
+ return (SET_ERROR(ENOTSUP));
+
return (dsl_scan(spa->spa_dsl_pool, func));
}
@@ -8049,15 +8854,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}
static void
-spa_async_probe(spa_t *spa, vdev_t *vd)
+spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
- if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = B_FALSE;
- vdev_reopen(vd); /* vdev_open() does the actual probe */
+ if (vd->vdev_fault_wanted) {
+ vd->vdev_fault_wanted = B_FALSE;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
- spa_async_probe(spa, vd->vdev_child[c]);
+ spa_async_fault_vdev(spa, vd->vdev_child[c]);
}
static void
@@ -8077,7 +8883,7 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd)
spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
}
-static void
+static __attribute__((noreturn)) void
spa_async_thread(void *arg)
{
spa_t *spa = (spa_t *)arg;
@@ -8145,11 +8951,11 @@ spa_async_thread(void *arg)
}
/*
- * See if any devices need to be probed.
+ * See if any devices need to be marked faulted.
*/
- if (tasks & SPA_ASYNC_PROBE) {
+ if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
- spa_async_probe(spa, spa->spa_root_vdev);
+ spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@@ -8157,7 +8963,8 @@ spa_async_thread(void *arg)
* If any devices are done replacing, detach them.
*/
if (tasks & SPA_ASYNC_RESILVER_DONE ||
- tasks & SPA_ASYNC_REBUILD_DONE) {
+ tasks & SPA_ASYNC_REBUILD_DONE ||
+ tasks & SPA_ASYNC_DETACH_SPARE) {
spa_vdev_resilver_done(spa);
}
@@ -8241,6 +9048,10 @@ spa_async_suspend(spa_t *spa)
if (condense_thread != NULL)
zthr_cancel(condense_thread);
+ zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+ if (raidz_expand_thread != NULL)
+ zthr_cancel(raidz_expand_thread);
+
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_cancel(discard_thread);
@@ -8267,6 +9078,10 @@ spa_async_resume(spa_t *spa)
if (condense_thread != NULL)
zthr_resume(condense_thread);
+ zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+ if (raidz_expand_thread != NULL)
+ zthr_resume(raidz_expand_thread);
+
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_resume(discard_thread);
@@ -8433,7 +9248,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
KM_SLEEP) == 0);
- bzero(packed + nvsize, bufsize - nvsize);
+ memset(packed + nvsize, 0, bufsize - nvsize);
dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
@@ -8472,13 +9287,15 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
nvroot = fnvlist_alloc();
if (sav->sav_count == 0) {
- fnvlist_add_nvlist_array(nvroot, config, NULL, 0);
+ fnvlist_add_nvlist_array(nvroot, config,
+ (const nvlist_t * const *)NULL, 0);
} else {
list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
for (i = 0; i < sav->sav_count; i++)
list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
B_FALSE, VDEV_CONFIG_L2CACHE);
- fnvlist_add_nvlist_array(nvroot, config, list, sav->sav_count);
+ fnvlist_add_nvlist_array(nvroot, config,
+ (const nvlist_t * const *)list, sav->sav_count);
for (i = 0; i < sav->sav_count; i++)
nvlist_free(list[i]);
kmem_free(list, sav->sav_count * sizeof (void *));
@@ -8499,6 +9316,11 @@ spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
{
spa_t *spa = vd->vdev_spa;
+ if (vd->vdev_root_zap != 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
+ VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+ vd->vdev_root_zap, tx));
+ }
if (vd->vdev_top_zap != 0) {
VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
vd->vdev_top_zap, tx));
@@ -8659,27 +9481,14 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
while ((elem = nvlist_next_nvpair(nvp, elem))) {
uint64_t intval;
- char *strval, *fname;
+ const char *strval, *fname;
zpool_prop_t prop;
const char *propname;
+ const char *elemname = nvpair_name(elem);
zprop_type_t proptype;
spa_feature_t fid;
- switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
- case ZPOOL_PROP_INVAL:
- /*
- * We checked this earlier in spa_prop_validate().
- */
- ASSERT(zpool_prop_feature(nvpair_name(elem)));
-
- fname = strchr(nvpair_name(elem), '@') + 1;
- VERIFY0(zfeature_lookup_name(fname, &fid));
-
- spa_feature_enable(spa, fid, tx);
- spa_history_log_internal(spa, "set", tx,
- "%s=enabled", nvpair_name(elem));
- break;
-
+ switch (prop = zpool_name_to_prop(elemname)) {
case ZPOOL_PROP_VERSION:
intval = fnvpair_value_uint64(elem);
/*
@@ -8722,7 +9531,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
spa_history_log_internal(spa, "set", tx,
- "%s=%s", nvpair_name(elem), strval);
+ "%s=%s", elemname, strval);
break;
case ZPOOL_PROP_COMPATIBILITY:
strval = fnvpair_value_string(elem);
@@ -8741,6 +9550,20 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
"%s=%s", nvpair_name(elem), strval);
break;
+ case ZPOOL_PROP_INVAL:
+ if (zpool_prop_feature(elemname)) {
+ fname = strchr(elemname, '@') + 1;
+ VERIFY0(zfeature_lookup_name(fname, &fid));
+
+ spa_feature_enable(spa, fid, tx);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=enabled", elemname);
+ break;
+ } else if (!zfs_prop_user(elemname)) {
+ ASSERT(zpool_prop_feature(elemname));
+ break;
+ }
+ zfs_fallthrough;
default:
/*
* Set pool property values in the poolprops mos object.
@@ -8753,8 +9576,13 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
}
/* normalize the property name */
- propname = zpool_prop_to_name(prop);
- proptype = zpool_prop_get_type(prop);
+ if (prop == ZPOOL_PROP_INVAL) {
+ propname = elemname;
+ proptype = PROP_TYPE_STRING;
+ } else {
+ propname = zpool_prop_to_name(prop);
+ proptype = zpool_prop_get_type(prop);
+ }
if (nvpair_type(elem) == DATA_TYPE_STRING) {
ASSERT(proptype == PROP_TYPE_STRING);
@@ -8763,7 +9591,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
spa->spa_pool_props_object, propname,
1, strlen(strval) + 1, strval, tx));
spa_history_log_internal(spa, "set", tx,
- "%s=%s", nvpair_name(elem), strval);
+ "%s=%s", elemname, strval);
} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
intval = fnvpair_value_uint64(elem);
@@ -8776,38 +9604,38 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
spa->spa_pool_props_object, propname,
8, 1, &intval, tx));
spa_history_log_internal(spa, "set", tx,
- "%s=%lld", nvpair_name(elem),
+ "%s=%lld", elemname,
(longlong_t)intval);
- } else {
- ASSERT(0); /* not allowed */
- }
- switch (prop) {
- case ZPOOL_PROP_DELEGATION:
- spa->spa_delegation = intval;
- break;
- case ZPOOL_PROP_BOOTFS:
- spa->spa_bootfs = intval;
- break;
- case ZPOOL_PROP_FAILUREMODE:
- spa->spa_failmode = intval;
- break;
- case ZPOOL_PROP_AUTOTRIM:
- spa->spa_autotrim = intval;
- spa_async_request(spa,
- SPA_ASYNC_AUTOTRIM_RESTART);
- break;
- case ZPOOL_PROP_AUTOEXPAND:
- spa->spa_autoexpand = intval;
- if (tx->tx_txg != TXG_INITIAL)
+ switch (prop) {
+ case ZPOOL_PROP_DELEGATION:
+ spa->spa_delegation = intval;
+ break;
+ case ZPOOL_PROP_BOOTFS:
+ spa->spa_bootfs = intval;
+ break;
+ case ZPOOL_PROP_FAILUREMODE:
+ spa->spa_failmode = intval;
+ break;
+ case ZPOOL_PROP_AUTOTRIM:
+ spa->spa_autotrim = intval;
spa_async_request(spa,
- SPA_ASYNC_AUTOEXPAND);
- break;
- case ZPOOL_PROP_MULTIHOST:
- spa->spa_multihost = intval;
- break;
- default:
- break;
+ SPA_ASYNC_AUTOTRIM_RESTART);
+ break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ if (tx->tx_txg != TXG_INITIAL)
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOEXPAND);
+ break;
+ case ZPOOL_PROP_MULTIHOST:
+ spa->spa_multihost = intval;
+ break;
+ default:
+ break;
+ }
+ } else {
+ ASSERT(0); /* not allowed */
}
}
@@ -9045,8 +9873,10 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
&spa->spa_deferred_bpobj, tx);
}
+ brt_sync(spa, txg);
ddt_sync(spa, txg);
dsl_scan_sync(dp, tx);
+ dsl_errorscrub_sync(dp, tx);
svr_sync(spa, tx);
spa_sync_upgrades(spa, tx);
@@ -9057,6 +9887,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
!= NULL)
vdev_sync(vd, txg);
+ if (pass == 1) {
+ /*
+ * dsl_pool_sync() -> dp_sync_tasks may have dirtied
+ * the config. If that happens, this txg should not
+ * be a no-op. So we must sync the config to the MOS
+ * before checking for no-op.
+ *
+ * Note that when the config is dirty, it will
+ * be written to the MOS (i.e. the MOS will be
+ * dirtied) every time we call spa_sync_config_object()
+ * in this txg. Therefore we can't call this after
+ * dsl_pool_sync() every pass, because it would
+ * prevent us from converging, since we'd dirty
+ * the MOS every pass.
+ *
+ * Sync tasks can only be processed in pass 1, so
+ * there's no need to do this in later passes.
+ */
+ spa_sync_config_object(spa, tx);
+ }
+
/*
* Note: We need to check if the MOS is dirty because we could
* have marked the MOS dirty without updating the uberblock
@@ -9067,7 +9918,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
* don't want to rely on that here).
*/
if (pass == 1 &&
- spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
!dmu_objset_is_dirty(mos, txg)) {
/*
* Nothing changed on the first pass, therefore this
@@ -9170,6 +10021,13 @@ spa_sync(spa_t *spa, uint64_t txg)
ZIO_FLAG_CANFAIL);
/*
+ * Now that there can be no more cloning in this transaction group,
+ * but we are still before issuing frees, we can process pending BRT
+ * updates.
+ */
+ brt_pending_apply(spa, txg);
+
+ /*
* Lock out configuration changes.
*/
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -9188,7 +10046,13 @@ spa_sync(spa_t *spa, uint64_t txg)
* into config changes that go out with this transaction group.
*/
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ /* Avoid holding the write lock unless actually necessary */
+ if (vd->vdev_aux == NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ continue;
+ }
/*
* We need the write lock here because, for aux vdevs,
* calling vdev_config_dirty() modifies sav_config.
@@ -9304,11 +10168,17 @@ spa_sync(spa_t *spa, uint64_t txg)
metaslab_class_evict_old(spa->spa_normal_class, txg);
metaslab_class_evict_old(spa->spa_log_class, txg);
+ /* spa_embedded_log_class has only one metaslab per vdev. */
+ metaslab_class_evict_old(spa->spa_special_class, txg);
+ metaslab_class_evict_old(spa->spa_dedup_class, txg);
spa_sync_close_syncing_log_sm(spa);
spa_update_dspace(spa);
+ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
+ vdev_autotrim_kick(spa);
+
/*
* It had better be the case that we didn't dirty anything
* since vdev_config_sync().
@@ -9362,6 +10232,132 @@ spa_sync_allpools(void)
mutex_exit(&spa_namespace_lock);
}
+taskq_t *
+spa_sync_tq_create(spa_t *spa, const char *name)
+{
+ kthread_t **kthreads;
+
+ ASSERT(spa->spa_sync_tq == NULL);
+ ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
+
+ /*
+ * - do not allow more allocators than cpus.
+ * - there may be more cpus than allocators.
+ * - do not allow more sync taskq threads than allocators or cpus.
+ */
+ int nthreads = spa->spa_alloc_count;
+ spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
+ nthreads, KM_SLEEP);
+
+ spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
+ nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
+ VERIFY(spa->spa_sync_tq != NULL);
+ VERIFY(kthreads != NULL);
+
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (int i = 0; i < nthreads; i++, ti++) {
+ ti->sti_thread = kthreads[i];
+ ti->sti_allocator = i;
+ }
+
+ kmem_free(kthreads, sizeof (*kthreads) * nthreads);
+ return (spa->spa_sync_tq);
+}
+
+void
+spa_sync_tq_destroy(spa_t *spa)
+{
+ ASSERT(spa->spa_sync_tq != NULL);
+
+ taskq_wait(spa->spa_sync_tq);
+ taskq_destroy(spa->spa_sync_tq);
+ kmem_free(spa->spa_syncthreads,
+ sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
+ spa->spa_sync_tq = NULL;
+}
+
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+ int i;
+
+ if (spa->spa_alloc_count == 1)
+ return (0);
+
+ mutex_enter(&spa->spa_allocs_use->sau_lock);
+ uint_t r = spa->spa_allocs_use->sau_rotor;
+ do {
+ if (++r == spa->spa_alloc_count)
+ r = 0;
+ } while (spa->spa_allocs_use->sau_inuse[r]);
+ spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
+ spa->spa_allocs_use->sau_rotor = r;
+ mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+ if (ti->sti_thread == curthread) {
+ ti->sti_allocator = r;
+ break;
+ }
+ }
+ ASSERT3S(i, <, spa->spa_alloc_count);
+ return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+ if (spa->spa_alloc_count > 1)
+ spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
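The acquire/release pair above is easiest to see in isolation. Below is a minimal userland sketch of the same rotor scan, with a plain bool array, no locking, and a hypothetical ALLOC_COUNT of 4 standing in for spa_alloc_count; it only illustrates the slot-selection logic, not the kernel synchronization, and it assumes a free slot always exists (as the kernel code does, since there are never more sync threads than allocators).

#include <stdio.h>
#include <stdbool.h>

#define	ALLOC_COUNT	4		/* hypothetical spa_alloc_count */

static bool inuse[ALLOC_COUNT];
static unsigned rotor;

/* Claim the next free slot after the rotor (assumes one is free). */
static unsigned
acq_allocator(void)
{
	unsigned r = rotor;

	do {
		if (++r == ALLOC_COUNT)
			r = 0;
	} while (inuse[r]);
	inuse[r] = true;
	rotor = r;
	return (r);
}

static void
rel_allocator(unsigned r)
{
	inuse[r] = false;
}

int
main(void)
{
	unsigned a = acq_allocator();	/* slot 1 */
	unsigned b = acq_allocator();	/* slot 2 */

	rel_allocator(a);		/* slot 1 is free again */
	printf("%u %u %u\n", a, b, acq_allocator());	/* prints 1 2 3 */
	return (0);
}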
+void
+spa_select_allocator(zio_t *zio)
+{
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ spa_t *spa = zio->io_spa;
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * A gang block (for example) may have inherited its parent's
+ * allocator, in which case there is nothing further to do here.
+ */
+ if (ZIO_HAS_ALLOCATOR(zio))
+ return;
+
+ ASSERT(spa != NULL);
+ ASSERT(bm != NULL);
+
+ /*
+ * First try to use an allocator assigned to the syncthread, and set
+ * the corresponding write issue taskq for the allocator.
+ * Note, we must have an open pool to do this.
+ */
+ if (spa->spa_sync_tq != NULL) {
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
+ if (ti->sti_thread == curthread) {
+ zio->io_allocator = ti->sti_allocator;
+ return;
+ }
+ }
+ }
+
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
+ bm->zb_blkid >> 20);
+
+ zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
+}
+
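To make the fallback hashing in spa_select_allocator() above concrete, here is a minimal userland sketch. The 64-bit mixer is only a stand-in for cityhash4(), and the allocator count of 4 is hypothetical; the point is that every blkid inside one 2^20-block region of an object hashes to the same allocator, while different regions may spread across allocators.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for cityhash4(); any decent 64-bit mixer shows the idea. */
static uint64_t
mix4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	uint64_t h = a * 0x9E3779B97F4A7C15ULL;

	h ^= b + 0xC2B2AE3D27D4EB4FULL + (h << 6) + (h >> 2);
	h ^= c + 0x165667B19E3779F9ULL + (h << 6) + (h >> 2);
	h ^= d + 0x27D4EB2F165667C5ULL + (h << 6) + (h >> 2);
	return (h);
}

/* Same shape as the fallback in spa_select_allocator(). */
static unsigned
pick_allocator(uint64_t objset, uint64_t object, int64_t level,
    uint64_t blkid, unsigned alloc_count)
{
	uint64_t hv = mix4(objset, object, (uint64_t)level, blkid >> 20);

	return ((unsigned)(hv % alloc_count));
}

int
main(void)
{
	unsigned n = 4;		/* hypothetical spa_alloc_count */

	/* Adjacent blkids in the same 2^20-block region: same allocator. */
	printf("blkid 100   -> %u\n", pick_allocator(54, 7, 0, 100, n));
	printf("blkid 101   -> %u\n", pick_allocator(54, 7, 0, 101, n));
	/* A different region may (but need not) land elsewhere. */
	printf("blkid 2<<20 -> %u\n", pick_allocator(54, 7, 0, 2 << 20, n));
	return (0);
}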
/*
* ==========================================================================
* Miscellaneous routines
@@ -9454,6 +10450,7 @@ spa_upgrade(spa_t *spa, uint64_t version)
static boolean_t
spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
{
+ (void) spa;
int i;
uint64_t vdev_guid;
@@ -9699,9 +10696,10 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
DSS_SCANNING);
break;
case ZPOOL_WAIT_RESILVER:
- if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+ *in_progress = vdev_rebuild_active(spa->spa_root_vdev);
+ if (*in_progress)
break;
- fallthrough;
+ zfs_fallthrough;
case ZPOOL_WAIT_SCRUB:
{
boolean_t scanning, paused, is_scrub;
@@ -9714,6 +10712,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
is_scrub == (activity == ZPOOL_WAIT_SCRUB));
break;
}
+ case ZPOOL_WAIT_RAIDZ_EXPAND:
+ {
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
+ break;
+ }
default:
panic("unrecognized value for activity %d", activity);
}
@@ -9817,6 +10821,8 @@ spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
ev->resource = resource;
}
+#else
+ (void) spa, (void) vd, (void) hist_nvl, (void) name;
#endif
return (ev);
}
@@ -9829,6 +10835,8 @@ spa_event_post(sysevent_t *ev)
zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
kmem_free(ev, sizeof (*ev));
}
+#else
+ (void) ev;
#endif
}
@@ -9900,10 +10908,14 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
+ "Percentage of CPUs to run a metaslab preload taskq");
+
/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
"log2 fraction of arc that can be used by inflight I/Os when "
"verifying pool during import");
+/* END CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
"Set to traverse metadata on pool import");
@@ -9914,29 +10926,47 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
"Print vdev tree to zfs_dbgmsg during pool import");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
"Percentage of CPUs to run an IO worker thread");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
"Number of threads per IO worker taskqueue");
-ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
"Allow importing pool with up to this number of missing top-level "
"vdevs (in read-only mode)");
+/* END CSTYLED */
-ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW,
- "Set the livelist condense zthr to pause");
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
+ ZMOD_RW, "Set the livelist condense zthr to pause");
-ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW,
- "Set the livelist condense synctask to pause");
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
+ ZMOD_RW, "Set the livelist condense synctask to pause");
-ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW,
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
+ INT, ZMOD_RW,
"Whether livelist condensing was canceled in the synctask");
-ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
+ INT, ZMOD_RW,
"Whether livelist condensing was canceled in the zthr function");
-ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
+ ZMOD_RW,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
+
+#ifdef _KERNEL
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
+ spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
+ "Configure IO queues for read IO");
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
+ spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
+ "Configure IO queues for write IO");
+#endif
/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+ "Number of CPUs per write issue taskq");
diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
index 09f62996853d..1efff47f87a0 100644
--- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
+++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -158,7 +158,7 @@
* amount of checkpointed data that has been freed within them while
* the pool had a checkpoint.
*/
-unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
+static uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
int
spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
@@ -166,7 +166,7 @@ spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
- bzero(pcs, sizeof (pool_checkpoint_stat_t));
+ memset(pcs, 0, sizeof (pool_checkpoint_stat_t));
int error = zap_contains(spa_meta_objset(spa),
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
@@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
if (error != 0) {
zfs_panic_recover("zfs: error %lld was returned "
"while incrementally destroying the checkpoint "
- "space map of vdev %u\n",
+ "space map of vdev %llu\n",
(longlong_t)error, vd->vdev_id);
}
ASSERT0(words_after);
@@ -380,10 +380,10 @@ spa_checkpoint_discard_is_done(spa_t *spa)
return (B_TRUE);
}
-/* ARGSUSED */
boolean_t
spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
{
+ (void) zthr;
spa_t *spa = arg;
if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
@@ -450,10 +450,10 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
}
-/* ARGSUSED */
static int
spa_checkpoint_check(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
@@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx)
if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
+ if (spa->spa_raidz_expand != NULL)
+ return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
+
if (spa->spa_checkpoint_txg != 0)
return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
@@ -474,10 +477,10 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
static void
spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
spa_t *spa = dp->dp_spa;
uberblock_t checkpoint = spa->spa_ubsync;
@@ -571,10 +574,10 @@ spa_checkpoint(const char *pool)
return (error);
}
-/* ARGSUSED */
static int
spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
@@ -589,10 +592,10 @@ spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
return (0);
}
-/* ARGSUSED */
static void
spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
@@ -631,7 +634,7 @@ EXPORT_SYMBOL(spa_checkpoint_discard_thread);
EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW,
"Limit for memory used in prefetching the checkpoint space map done "
"on each vdev while discarding the checkpoint");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
index ad82932ce567..a77874ea0dd3 100644
--- a/sys/contrib/openzfs/module/zfs/spa_config.c
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -67,8 +67,10 @@ static uint64_t spa_config_generation = 1;
* This can be overridden in userland to preserve an alternate namespace for
* userland pools when doing testing.
*/
-char *spa_config_path = ZPOOL_CACHE;
-int zfs_autoimport_disable = 1;
+char *spa_config_path = (char *)ZPOOL_CACHE;
+#ifdef _KERNEL
+static int zfs_autoimport_disable = B_TRUE;
+#endif
/*
* Called when the module is first loaded, this routine loads the configuration
@@ -238,11 +240,12 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
* would be required.
*/
void
-spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent,
+ boolean_t postblkidevent)
{
spa_config_dirent_t *dp, *tdp;
nvlist_t *nvl;
- char *pool_name;
+ const char *pool_name;
boolean_t ccw_failure;
int error = 0;
@@ -344,6 +347,18 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
if (postsysevent)
spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
+
+ /*
+ * Post udev event to sync blkid information if the pool is created
+ * or a new vdev is added to the pool.
+ */
+ if ((target->spa_root_vdev) && postblkidevent) {
+ vdev_post_kobj_evt(target->spa_root_vdev);
+ for (int i = 0; i < target->spa_l2cache.sav_count; i++)
+ vdev_post_kobj_evt(target->spa_l2cache.sav_vdevs[i]);
+ for (int i = 0; i < target->spa_spares.sav_count; i++)
+ vdev_post_kobj_evt(target->spa_spares.sav_vdevs[i]);
+ }
}
/*
@@ -352,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
* So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
* information for all pool visible within the zone.
*/
-nvlist_t *
-spa_all_configs(uint64_t *generation)
+int
+spa_all_configs(uint64_t *generation, nvlist_t **pools)
{
- nvlist_t *pools;
spa_t *spa = NULL;
if (*generation == spa_config_generation)
- return (NULL);
+ return (SET_ERROR(EEXIST));
- pools = fnvlist_alloc();
+ int error = mutex_enter_interruptible(&spa_namespace_lock);
+ if (error)
+ return (SET_ERROR(EINTR));
- mutex_enter(&spa_namespace_lock);
+ *pools = fnvlist_alloc();
while ((spa = spa_next(spa)) != NULL) {
if (INGLOBALZONE(curproc) ||
zone_dataset_visible(spa_name(spa), NULL)) {
mutex_enter(&spa->spa_props_lock);
- fnvlist_add_nvlist(pools, spa_name(spa),
+ fnvlist_add_nvlist(*pools, spa_name(spa),
spa->spa_config);
mutex_exit(&spa->spa_props_lock);
}
@@ -376,7 +392,7 @@ spa_all_configs(uint64_t *generation)
*generation = spa_config_generation;
mutex_exit(&spa_namespace_lock);
- return (pools);
+ return (0);
}
void
@@ -403,7 +419,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
unsigned long hostid = 0;
boolean_t locked = B_FALSE;
uint64_t split_guid;
- char *pool_name;
+ const char *pool_name;
if (vd == NULL) {
vd = rvd;
@@ -598,6 +614,7 @@ spa_config_update(spa_t *spa, int what)
*/
if (!spa->spa_is_root) {
spa_write_cachefile(spa, B_FALSE,
+ what != SPA_CONFIG_UPDATE_POOL,
what != SPA_CONFIG_UPDATE_POOL);
}
@@ -611,7 +628,6 @@ EXPORT_SYMBOL(spa_config_set);
EXPORT_SYMBOL(spa_config_generate);
EXPORT_SYMBOL(spa_config_update);
-/* BEGIN CSTYLED */
#ifdef __linux__
/* string sysctls require a char array on FreeBSD */
ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
@@ -620,4 +636,3 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW,
"Disable pool import at module load");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c
index fa5120eb61b3..62d7b4fa2df2 100644
--- a/sys/contrib/openzfs/module/zfs/spa_errlog.c
+++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,7 +20,9 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014, Delphix. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2021, 2022, George Amanakis. All rights reserved.
*/
/*
@@ -43,6 +45,16 @@
* calculation when the data is requested, storing the result so future queries
* will be faster.
*
+ * If the head_errlog feature is enabled, a different on-disk format is used.
+ * The error log of each head dataset is stored separately in the zap object
+ * and keyed by the head id. This enables listing every dataset affected in
+ * userland. In order to be able to track whether an error block has been
+ * modified or added to snapshots since it was marked as an error, a new tuple
+ * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
+ * transaction group of an error block on-disk. The birth transaction group is
+ * used by check_filesystem() to assess whether this block was freed,
+ * re-written or added to a snapshot since its marking as an error.
+ *
* This log is then shipped into an nvlist where the key is the dataset name and
* the value is the object name. Userland is then responsible for uniquifying
* this list and displaying it to the user.
@@ -53,7 +65,25 @@
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+#include <sys/zfs_znode.h>
+
+#define NAME_MAX_LEN 64
+
+typedef struct clones {
+ uint64_t clone_ds;
+ list_node_t node;
+} clones_t;
+/*
+ * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
+ * of on-disk error log entries that will be converted to the new
+ * format when enabling head_errlog. Defaults to 0 which converts
+ * all log entries.
+ */
+static uint_t spa_upgrade_errlog_limit = 0;
/*
* Convert a bookmark to a string.
@@ -67,9 +97,35 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
}
/*
- * Convert a string to a bookmark
+ * Convert an err_phys to a string.
+ */
+static void
+errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level,
+ (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth);
+}
+
+/*
+ * Convert a string to a err_phys.
+ */
+void
+name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
+{
+ zep->zb_object = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_blkid = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zep->zb_birth = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+
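The ZAP key format produced by errphys_to_name() and parsed by name_to_errphys() above is just four colon-separated hex fields. Here is a small userland round-trip sketch, using strtoull() as a stand-in for zfs_strtonum() and a local struct instead of zbookmark_err_phys_t:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

/* Local stand-in for zbookmark_err_phys_t: object, level, blkid, birth. */
struct err_tuple {
	unsigned long long object, level, blkid, birth;
};

static void
tuple_to_name(const struct err_tuple *t, char *buf, size_t len)
{
	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
	    t->object, t->level, t->blkid, t->birth);
}

static void
name_to_tuple(const char *buf, struct err_tuple *t)
{
	char *end;

	t->object = strtoull(buf, &end, 16);
	assert(*end == ':');
	t->level = strtoull(end + 1, &end, 16);
	assert(*end == ':');
	t->blkid = strtoull(end + 1, &end, 16);
	assert(*end == ':');
	t->birth = strtoull(end + 1, &end, 16);
	assert(*end == '\0');
}

int
main(void)
{
	struct err_tuple in = { 0x42, 0, 0x1a2b, 0x9c4 }, out;
	char name[64];

	tuple_to_name(&in, name, sizeof (name));
	printf("ZAP key: %s\n", name);		/* 42:0:1a2b:9c4 */
	name_to_tuple(name, &out);
	assert(in.object == out.object && in.birth == out.birth);
	return (0);
}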
+/*
+ * Convert a string to a bookmark.
*/
-#ifdef _KERNEL
static void
name_to_bookmark(char *buf, zbookmark_phys_t *zb)
{
@@ -82,7 +138,41 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
ASSERT(*buf == '\0');
}
-#endif
+
+void
+zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
+{
+ zb->zb_objset = dataset;
+ zb->zb_object = zep->zb_object;
+ zb->zb_level = zep->zb_level;
+ zb->zb_blkid = zep->zb_blkid;
+}
+
+static void
+name_to_object(char *buf, uint64_t *obj)
+{
+ *obj = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == '\0');
+}
+
+/*
+ * Retrieve the head filesystem.
+ */
+static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds)
+{
+ dsl_dataset_t *ds;
+ int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool,
+ dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+
+ if (error != 0)
+ return (error);
+
+ ASSERT(head_ds);
+ *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ return (error);
+}
/*
* Log an uncorrectable error to the persistent error log. We add it to the
@@ -90,7 +180,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
* during spa_errlog_sync().
*/
void
-spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
+spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth)
{
spa_error_entry_t search;
spa_error_entry_t *new;
@@ -123,96 +213,856 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
new->se_bookmark = *zb;
- avl_insert(tree, new, where);
+ /*
+ * If the head_errlog feature is enabled, store the birth txg now. In
+ * case the file is deleted before spa_errlog_sync() runs, we will not
+ * be able to retrieve the birth txg.
+ */
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ new->se_zep.zb_object = zb->zb_object;
+ new->se_zep.zb_level = zb->zb_level;
+ new->se_zep.zb_blkid = zb->zb_blkid;
+ new->se_zep.zb_birth = birth;
+ }
+
+ avl_insert(tree, new, where);
mutex_exit(&spa->spa_errlist_lock);
}
+int
+find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
+ uint64_t *birth_txg)
+{
+ objset_t *os;
+ int error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ dnode_t *dn;
+ blkptr_t bp;
+
+ error = dnode_hold(os, zep->zb_object, FTAG, &dn);
+ if (error != 0)
+ return (error);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
+ NULL);
+ if (error == 0 && BP_IS_HOLE(&bp))
+ error = SET_ERROR(ENOENT);
+
+ *birth_txg = BP_GET_LOGICAL_BIRTH(&bp);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
+/*
+ * This function finds the oldest affected filesystem containing an error
+ * block.
+ */
+int
+find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ uint64_t *top_affected_fs)
+{
+ uint64_t oldest_dsobj;
+ int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
+ &oldest_dsobj);
+ if (error != 0)
+ return (error);
+
+ dsl_dataset_t *ds;
+ error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ *top_affected_fs =
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ return (0);
+}
+
+
+#ifdef _KERNEL
+/*
+ * Copy the bookmark to the end of the user-space buffer which starts at
+ * uaddr and has *count unused entries, and decrement *count by 1.
+ */
+static int
+copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
+{
+ if (*count == 0)
+ return (SET_ERROR(ENOMEM));
+
+ *count -= 1;
+ if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0)
+ return (SET_ERROR(EFAULT));
+ return (0);
+}
+
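To illustrate the back-fill convention used by copyout_entry() above, here is a minimal userland sketch with memcpy() standing in for copyout() and plain ints standing in for zbookmark_phys_t entries: the buffer is filled from the end toward the front, and the in-out count always names how many slots at the front are still unused.

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>

/* Stand-in for copyout_entry(): ints instead of zbookmark_phys_t. */
static int
backfill_entry(int value, int *buf, uint64_t *count)
{
	if (*count == 0)
		return (ENOMEM);	/* no unused slots left */
	*count -= 1;
	memcpy(buf + *count, &value, sizeof (value));
	return (0);
}

int
main(void)
{
	int buf[4] = { 0 };
	uint64_t count = 4;	/* unused slots at the front of the buffer */

	for (int v = 1; v <= 3; v++)
		(void) backfill_entry(v, buf, &count);

	/* buf is now { 0, 3, 2, 1 } and one unused slot remains. */
	printf("count=%llu buf={%d,%d,%d,%d}\n", (unsigned long long)count,
	    buf[0], buf[1], buf[2], buf[3]);
	return (0);
}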
/*
- * Return the number of errors currently in the error log. This is actually the
- * sum of both the last log and the current log, since we don't know the union
- * of these logs until we reach userland.
+ * Each time the error block is referenced by a snapshot or clone, add a
+ * zbookmark_phys_t entry to the userspace array at uaddr. The array is
+ * filled from the back and the in-out parameter *count is modified to be the
+ * number of unused entries at the beginning of the array. The function
+ * scrub_filesystem() is modelled after this one.
*/
+static int
+check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ void *uaddr, uint64_t *count, list_t *clones_list)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj_flags(dp, head_ds,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t latest_txg;
+ uint64_t txg_to_consider = spa->spa_syncing_txg;
+ boolean_t check_snapshot = B_TRUE;
+ error = find_birth_txg(ds, zep, &latest_txg);
+
+ /*
+	 * If find_birth_txg() errors out, txg_to_consider stays at the spa's
+	 * syncing txg: if check_filesystem() errors out
+ * then affected snapshots or clones will not be checked.
+ */
+ if (error == 0 && zep->zb_birth == latest_txg) {
+ /* Block neither free nor rewritten. */
+ zbookmark_phys_t zb;
+ zep_to_zb(head_ds, zep, &zb);
+ error = copyout_entry(&zb, uaddr, count);
+ if (error != 0) {
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ return (error);
+ }
+ check_snapshot = B_FALSE;
+ } else if (error == 0) {
+ txg_to_consider = latest_txg;
+ }
+
+ /*
+ * Retrieve the number of snapshots if the dataset is not a snapshot.
+ */
+ uint64_t snap_count = 0;
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+
+ error = zap_count(spa->spa_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+
+ if (error != 0) {
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ return (error);
+ }
+ }
+
+ if (snap_count == 0) {
+ /* Filesystem without snapshots. */
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ return (0);
+ }
+
+ uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t),
+ KM_SLEEP);
+
+ int aff_snap_count = 0;
+ uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones;
+
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ /* Check only snapshots created from this file system. */
+ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
+ snap_obj_txg <= txg_to_consider) {
+
+ error = dsl_dataset_hold_obj_flags(dp, snap_obj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+ if (error != 0)
+ goto out;
+
+ if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) {
+ snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ continue;
+ }
+
+ boolean_t affected = B_TRUE;
+ if (check_snapshot) {
+ uint64_t blk_txg;
+ error = find_birth_txg(ds, zep, &blk_txg);
+ affected = (error == 0 && zep->zb_birth == blk_txg);
+ }
+
+ /* Report errors in snapshots. */
+ if (affected) {
+ snap_obj_array[aff_snap_count] = snap_obj;
+ aff_snap_count++;
+
+ zbookmark_phys_t zb;
+ zep_to_zb(snap_obj, zep, &zb);
+ error = copyout_entry(&zb, uaddr, count);
+ if (error != 0) {
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
+ FTAG);
+ goto out;
+ }
+ }
+ snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ }
+
+ if (zap_clone == 0 || aff_snap_count == 0) {
+ error = 0;
+ goto out;
+ }
+
+ /* Check clones. */
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+
+ zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+
+ dsl_dataset_t *clone;
+ error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &clone);
+
+ if (error != 0)
+ break;
+
+ /*
+ * Only clones whose origins were affected could also
+ * have affected snapshots.
+ */
+ boolean_t found = B_FALSE;
+ for (int i = 0; i < snap_count; i++) {
+ if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj
+ == snap_obj_array[i])
+ found = B_TRUE;
+ }
+ dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ if (!found)
+ continue;
+
+ clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
+ ct->clone_ds = za->za_first_integer;
+ list_insert_tail(clones_list, ct);
+ }
+
+ zap_cursor_fini(zc);
+ kmem_free(za, sizeof (*za));
+ kmem_free(zc, sizeof (*zc));
+
+out:
+ kmem_free(snap_obj_array, sizeof (*snap_obj_array));
+ return (error);
+}
+
+static int
+process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+ void *uaddr, uint64_t *count)
+{
+ /*
+ * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the
+ * birth txg or the head filesystem of the block pointer. This may
+ * happen e.g. when an encrypted filesystem is not mounted or when
+ * the key is not loaded. In this case do not proceed to
+ * check_filesystem(), instead do the accounting here.
+ */
+ if (zep->zb_birth == 0 || head_ds == 0) {
+ zbookmark_phys_t zb;
+ zep_to_zb(head_ds, zep, &zb);
+ int error = copyout_entry(&zb, uaddr, count);
+ if (error != 0) {
+ return (error);
+ }
+ return (0);
+ }
+
+ uint64_t top_affected_fs;
+ uint64_t init_count = *count;
+ int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
+ if (error == 0) {
+ clones_t *ct;
+ list_t clones_list;
+
+ list_create(&clones_list, sizeof (clones_t),
+ offsetof(clones_t, node));
+
+ error = check_filesystem(spa, top_affected_fs, zep,
+ uaddr, count, &clones_list);
+
+ while ((ct = list_remove_head(&clones_list)) != NULL) {
+ error = check_filesystem(spa, ct->clone_ds, zep,
+ uaddr, count, &clones_list);
+ kmem_free(ct, sizeof (*ct));
+
+ if (error) {
+ while (!list_is_empty(&clones_list)) {
+ ct = list_remove_head(&clones_list);
+ kmem_free(ct, sizeof (*ct));
+ }
+ break;
+ }
+ }
+
+ list_destroy(&clones_list);
+ }
+ if (error == 0 && init_count == *count) {
+ /*
+ * If we reach this point, no errors have been detected
+ * in the checked filesystems/snapshots. Before returning mark
+ * the error block to be removed from the error lists and logs.
+ */
+ zbookmark_phys_t zb;
+ zep_to_zb(head_ds, zep, &zb);
+ spa_remove_error(spa, &zb, zep->zb_birth);
+ }
+
+ return (error);
+}
+#endif
+
+/* Return the number of errors in the error log */
uint64_t
-spa_get_errlog_size(spa_t *spa)
+spa_get_last_errlog_size(spa_t *spa)
{
uint64_t total = 0, count;
-
mutex_enter(&spa->spa_errlog_lock);
- if (spa->spa_errlog_scrub != 0 &&
- zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
- &count) == 0)
- total += count;
- if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ if (spa->spa_errlog_last != 0 &&
zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
&count) == 0)
total += count;
mutex_exit(&spa->spa_errlog_lock);
+ return (total);
+}
+
+/*
+ * If a healed bookmark matches an entry in the error log we stash it in a tree
+ * so that we can later remove the related log entries in sync context.
+ */
+static void
+spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb,
+ const uint64_t birth)
+{
+ char name[NAME_MAX_LEN];
+
+ if (obj == 0)
+ return;
+
+ boolean_t held_list = B_FALSE;
+ boolean_t held_log = B_FALSE;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ bookmark_to_name(healed_zb, name, sizeof (name));
+
+ if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset,
+ name) == 0) {
+ if (!MUTEX_HELD(&spa->spa_errlog_lock)) {
+ mutex_enter(&spa->spa_errlog_lock);
+ held_log = B_TRUE;
+ }
+
+ /*
+ * Found an error matching healed zb, add zb to our
+ * tree of healed errors
+ */
+ avl_tree_t *tree = &spa->spa_errlist_healed;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_index_t where;
+ search.se_bookmark = *healed_zb;
+ if (!MUTEX_HELD(&spa->spa_errlist_lock)) {
+ mutex_enter(&spa->spa_errlist_lock);
+ held_list = B_TRUE;
+ }
+ if (avl_find(tree, &search, &where) != NULL) {
+ if (held_list)
+ mutex_exit(&spa->spa_errlist_lock);
+ if (held_log)
+ mutex_exit(&spa->spa_errlog_lock);
+ return;
+ }
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *healed_zb;
+ avl_insert(tree, new, where);
+ if (held_list)
+ mutex_exit(&spa->spa_errlist_lock);
+ if (held_log)
+ mutex_exit(&spa->spa_errlog_lock);
+ }
+ return;
+ }
+
+ zbookmark_err_phys_t healed_zep;
+ healed_zep.zb_object = healed_zb->zb_object;
+ healed_zep.zb_level = healed_zb->zb_level;
+ healed_zep.zb_blkid = healed_zb->zb_blkid;
+ healed_zep.zb_birth = birth;
+
+ errphys_to_name(&healed_zep, name, sizeof (name));
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ if (zap_contains(spa->spa_meta_objset, za.za_first_integer,
+ name) == 0) {
+ if (!MUTEX_HELD(&spa->spa_errlog_lock)) {
+ mutex_enter(&spa->spa_errlog_lock);
+ held_log = B_TRUE;
+ }
+
+ avl_tree_t *tree = &spa->spa_errlist_healed;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_index_t where;
+ search.se_bookmark = *healed_zb;
+
+ if (!MUTEX_HELD(&spa->spa_errlist_lock)) {
+ mutex_enter(&spa->spa_errlist_lock);
+ held_list = B_TRUE;
+ }
+
+ if (avl_find(tree, &search, &where) != NULL) {
+ if (held_list)
+ mutex_exit(&spa->spa_errlist_lock);
+ if (held_log)
+ mutex_exit(&spa->spa_errlog_lock);
+ continue;
+ }
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *healed_zb;
+ new->se_zep = healed_zep;
+ avl_insert(tree, new, where);
+
+ if (held_list)
+ mutex_exit(&spa->spa_errlist_lock);
+ if (held_log)
+ mutex_exit(&spa->spa_errlog_lock);
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * If this error exists in the given tree remove it.
+ */
+static void
+remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb)
+{
+ spa_error_entry_t search, *found;
+ avl_index_t where;
+
+ mutex_enter(&spa->spa_errlist_lock);
+ search.se_bookmark = *zb;
+ if ((found = avl_find(t, &search, &where)) != NULL) {
+ avl_remove(t, found);
+ kmem_free(found, sizeof (spa_error_entry_t));
+ }
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+
+/*
+ * Removes all of the recv healed errors from both on-disk error logs
+ */
+static void
+spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
+{
+ char name[NAME_MAX_LEN];
+ spa_error_entry_t *se;
+ void *cookie = NULL;
+
+ ASSERT(MUTEX_HELD(&spa->spa_errlog_lock));
+
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_healed,
+ &cookie)) != NULL) {
+ remove_error_from_list(spa, s, &se->se_bookmark);
+ remove_error_from_list(spa, l, &se->se_bookmark);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ bookmark_to_name(&se->se_bookmark, name, sizeof (name));
+ (void) zap_remove(spa->spa_meta_objset,
+ spa->spa_errlog_last, name, tx);
+ (void) zap_remove(spa->spa_meta_objset,
+ spa->spa_errlog_scrub, name, tx);
+ } else {
+ errphys_to_name(&se->se_zep, name, sizeof (name));
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_errlog_last);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ zap_remove(spa->spa_meta_objset,
+ za.za_first_integer, name, tx);
+ }
+ zap_cursor_fini(&zc);
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_errlog_scrub);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ zap_remove(spa->spa_meta_objset,
+ za.za_first_integer, name, tx);
+ }
+ zap_cursor_fini(&zc);
+ }
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Stash away healed bookmarks to remove them from the on-disk error logs
+ * later in spa_remove_healed_errors().
+ */
+void
+spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth)
+{
+ spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth);
+ spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth);
+}
+
+static uint64_t
+approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj)
+{
+ if (spa_err_obj == 0)
+ return (0);
+ uint64_t total = 0;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ uint64_t count;
+ if (zap_count(spa->spa_meta_objset, za.za_first_integer,
+ &count) == 0)
+ total += count;
+ }
+ zap_cursor_fini(&zc);
+ return (total);
+}
+
+/*
+ * Return the approximate number of errors currently in the error log. This
+ * will be nonzero if there are any errors, but the value may be more or
+ * less than the exact number of entries returned by spa_get_errlog().
+ */
+uint64_t
+spa_approx_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ mutex_enter(&spa->spa_errlog_lock);
+ uint64_t count;
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ } else {
+ mutex_enter(&spa->spa_errlog_lock);
+ total += approx_errlog_size_impl(spa, spa->spa_errlog_last);
+ total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub);
+ mutex_exit(&spa->spa_errlog_lock);
+ }
mutex_enter(&spa->spa_errlist_lock);
total += avl_numnodes(&spa->spa_errlist_last);
total += avl_numnodes(&spa->spa_errlist_scrub);
mutex_exit(&spa->spa_errlist_lock);
-
return (total);
}
-#ifdef _KERNEL
-static int
-process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+/*
+ * This function sweeps through an on-disk error log and stores all bookmarks
+ * as error bookmarks in a new ZAP object. At the end we discard the old one,
+ * and spa_upgrade_errlog() will set the spa's on-disk error log to the new
+ * ZAP object.
+ */
+static void
+sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
+ dmu_tx_t *tx)
{
zap_cursor_t zc;
zap_attribute_t za;
zbookmark_phys_t zb;
+ uint64_t count;
- if (obj == 0)
- return (0);
+ *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
+ DMU_OT_NONE, 0, tx);
+
+ /*
+	 * If we cannot perform the upgrade we should clear the old on-disk
+ * error logs.
+ */
+ if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
+ return;
+ }
- for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
+ if (spa_upgrade_errlog_limit != 0 &&
+ zc.zc_cd == spa_upgrade_errlog_limit)
+ break;
- if (*count == 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(ENOMEM));
+ name_to_bookmark(za.za_name, &zb);
+
+ zbookmark_err_phys_t zep;
+ zep.zb_object = zb.zb_object;
+ zep.zb_level = zb.zb_level;
+ zep.zb_blkid = zb.zb_blkid;
+ zep.zb_birth = 0;
+
+ /*
+ * In case of an error we should simply continue instead of
+ * returning prematurely. See the next comment.
+ */
+ uint64_t head_ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+
+ int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+ if (error != 0)
+ continue;
+
+ head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+
+ /*
+ * The objset and the dnode are required for getting the block
+ * pointer, which is used to determine if BP_IS_HOLE(). If
+ * getting the objset or the dnode fails, do not create a
+ * zap entry (presuming we know the dataset) as this may create
+ * spurious errors that we cannot ever resolve. If an error is
+ * truly persistent, it should re-appear after a scan.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ continue;
}
- name_to_bookmark(za.za_name, &zb);
+ dnode_t *dn;
+ blkptr_t bp;
- if (copyout(&zb, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0) {
- zap_cursor_fini(&zc);
- return (SET_ERROR(EFAULT));
+ if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ continue;
}
- *count -= 1;
- }
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
+ NULL, NULL);
+ if (error == EACCES)
+ error = 0;
+ else if (!error)
+ zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp);
+
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ if (error != 0 || BP_IS_HOLE(&bp))
+ continue;
+
+ uint64_t err_obj;
+ error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
+ head_ds, &err_obj);
+
+ if (error == ENOENT) {
+ err_obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset,
+ *newobj, head_ds, err_obj, tx);
+ }
+ char buf[64];
+ errphys_to_name(&zep, buf, sizeof (buf));
+
+ const char *name = "";
+ (void) zap_update(spa->spa_meta_objset, err_obj,
+ buf, 1, strlen(name) + 1, name, tx);
+ }
zap_cursor_fini(&zc);
- return (0);
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
+}
+
+void
+spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
+{
+ uint64_t newobj = 0;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_last != 0) {
+ sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx);
+ spa->spa_errlog_last = newobj;
+
+ (void) zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+ sizeof (uint64_t), 1, &spa->spa_errlog_last, tx);
+ }
+
+ if (spa->spa_errlog_scrub != 0) {
+ sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx);
+ spa->spa_errlog_scrub = newobj;
+
+ (void) zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+ sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx);
+ }
+
+ mutex_exit(&spa->spa_errlog_lock);
}
+#ifdef _KERNEL
+/*
+ * If an error block is shared by two datasets it will be counted twice.
+ */
static int
-process_error_list(avl_tree_t *list, void *addr, size_t *count)
+process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
{
- spa_error_entry_t *se;
+ if (obj == 0)
+ return (0);
- for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+
+ zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ for (zap_cursor_init(zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ if (*count == 0) {
+ zap_cursor_fini(zc);
+ kmem_free(zc, sizeof (*zc));
+ kmem_free(za, sizeof (*za));
+ return (SET_ERROR(ENOMEM));
+ }
+
+ zbookmark_phys_t zb;
+ name_to_bookmark(za->za_name, &zb);
+
+ int error = copyout_entry(&zb, uaddr, count);
+ if (error != 0) {
+ zap_cursor_fini(zc);
+ kmem_free(zc, sizeof (*zc));
+ kmem_free(za, sizeof (*za));
+ return (error);
+ }
+ }
+ zap_cursor_fini(zc);
+ kmem_free(zc, sizeof (*zc));
+ kmem_free(za, sizeof (*za));
+ return (0);
+ }
- if (*count == 0)
- return (SET_ERROR(ENOMEM));
+ for (zap_cursor_init(zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+
+ zap_cursor_t *head_ds_cursor;
+ zap_attribute_t *head_ds_attr;
+
+ head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
+ head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ uint64_t head_ds_err_obj = za->za_first_integer;
+ uint64_t head_ds;
+ name_to_object(za->za_name, &head_ds);
+ for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
+ head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
+ head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
+
+ zbookmark_err_phys_t head_ds_block;
+ name_to_errphys(head_ds_attr->za_name, &head_ds_block);
+ int error = process_error_block(spa, head_ds,
+ &head_ds_block, uaddr, count);
+
+ if (error != 0) {
+ zap_cursor_fini(head_ds_cursor);
+ kmem_free(head_ds_cursor,
+ sizeof (*head_ds_cursor));
+ kmem_free(head_ds_attr, sizeof (*head_ds_attr));
+
+ zap_cursor_fini(zc);
+ kmem_free(za, sizeof (*za));
+ kmem_free(zc, sizeof (*zc));
+ return (error);
+ }
+ }
+ zap_cursor_fini(head_ds_cursor);
+ kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
+ kmem_free(head_ds_attr, sizeof (*head_ds_attr));
+ }
+ zap_cursor_fini(zc);
+ kmem_free(za, sizeof (*za));
+ kmem_free(zc, sizeof (*zc));
+ return (0);
+}
- if (copyout(&se->se_bookmark, (char *)addr +
- (*count - 1) * sizeof (zbookmark_phys_t),
- sizeof (zbookmark_phys_t)) != 0)
- return (SET_ERROR(EFAULT));
+static int
+process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
+{
+ spa_error_entry_t *se;
- *count -= 1;
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ for (se = avl_first(list); se != NULL;
+ se = AVL_NEXT(list, se)) {
+ int error =
+ copyout_entry(&se->se_bookmark, uaddr, count);
+ if (error != 0) {
+ return (error);
+ }
+ }
+ return (0);
}
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+ uint64_t head_ds = 0;
+ int error = get_head_ds(spa, se->se_bookmark.zb_objset,
+ &head_ds);
+
+ /*
+ * If get_head_ds() errors out, set the head filesystem
+ * to the filesystem stored in the bookmark of the
+ * error block.
+ */
+ if (error != 0)
+ head_ds = se->se_bookmark.zb_objset;
+
+ error = process_error_block(spa, head_ds,
+ &se->se_zep, uaddr, count);
+ if (error != 0)
+ return (error);
+ }
return (0);
}
#endif
@@ -229,11 +1079,18 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count)
* the error list lock when we are finished.
*/
int
-spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
{
int ret = 0;
#ifdef _KERNEL
+ /*
+ * The pool config lock is needed to hold a dataset_t via (among other
+ * places) process_error_list() -> process_error_block()->
+ * find_top_affected_fs(), and lock ordering requires that we get it
+ * before the spa_errlog_lock.
+ */
+ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
mutex_enter(&spa->spa_errlog_lock);
ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
@@ -244,14 +1101,17 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
mutex_enter(&spa->spa_errlist_lock);
if (!ret)
- ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr,
count);
if (!ret)
- ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ ret = process_error_list(spa, &spa->spa_errlist_last, uaddr,
count);
mutex_exit(&spa->spa_errlist_lock);
mutex_exit(&spa->spa_errlog_lock);
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+#else
+ (void) spa, (void) uaddr, (void) count;
#endif
return (ret);
@@ -297,35 +1157,89 @@ spa_errlog_drain(spa_t *spa)
/*
* Process a list of errors into the current on-disk log.
*/
-static void
+void
sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
{
spa_error_entry_t *se;
- char buf[64];
+ char buf[NAME_MAX_LEN];
void *cookie;
- if (avl_numnodes(t) != 0) {
- /* create log if necessary */
- if (*obj == 0)
- *obj = zap_create(spa->spa_meta_objset,
- DMU_OT_ERROR_LOG, DMU_OT_NONE,
- 0, tx);
+ if (avl_numnodes(t) == 0)
+ return;
+
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
+ DMU_OT_NONE, 0, tx);
- /* add errors to the current log */
+ /* add errors to the current log */
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
- char *name = se->se_name ? se->se_name : "";
-
bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+ const char *name = se->se_name ? se->se_name : "";
+ (void) zap_update(spa->spa_meta_objset, *obj, buf, 1,
+ strlen(name) + 1, name, tx);
+ }
+ } else {
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ zbookmark_err_phys_t zep;
+ zep.zb_object = se->se_zep.zb_object;
+ zep.zb_level = se->se_zep.zb_level;
+ zep.zb_blkid = se->se_zep.zb_blkid;
+ zep.zb_birth = se->se_zep.zb_birth;
+
+ uint64_t head_ds = 0;
+ int error = get_head_ds(spa, se->se_bookmark.zb_objset,
+ &head_ds);
+
+ /*
+ * If get_head_ds() errors out, set the head filesystem
+ * to the filesystem stored in the bookmark of the
+ * error block.
+ */
+ if (error != 0)
+ head_ds = se->se_bookmark.zb_objset;
+
+ uint64_t err_obj;
+ error = zap_lookup_int_key(spa->spa_meta_objset,
+ *obj, head_ds, &err_obj);
+
+ if (error == ENOENT) {
+ err_obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset,
+ *obj, head_ds, err_obj, tx);
+ }
+ errphys_to_name(&zep, buf, sizeof (buf));
+
+ const char *name = se->se_name ? se->se_name : "";
(void) zap_update(spa->spa_meta_objset,
- *obj, buf, 1, strlen(name) + 1, name, tx);
+ err_obj, buf, 1, strlen(name) + 1, name, tx);
}
+ }
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+}
- /* purge the error list */
- cookie = NULL;
- while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(se, sizeof (spa_error_entry_t));
+static void
+delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ VERIFY0(dmu_object_free(spa->spa_meta_objset,
+ za.za_first_integer, tx));
+ }
+ zap_cursor_fini(&zc);
}
+ VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
}
/*
@@ -352,6 +1266,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
*/
if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ avl_numnodes(&spa->spa_errlist_healed) == 0 &&
!spa->spa_scrub_finished) {
mutex_exit(&spa->spa_errlist_lock);
return;
@@ -362,11 +1277,23 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
spa->spa_scrub_finished = B_FALSE;
mutex_exit(&spa->spa_errlist_lock);
+
+ /*
+ * The pool config lock is needed to hold a dataset_t via
+ * sync_error_list() -> get_head_ds(), and lock ordering
+ * requires that we get it before the spa_errlog_lock.
+ */
+ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
mutex_enter(&spa->spa_errlog_lock);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
/*
+	 * Remove healed errors from the error lists.
+ */
+ spa_remove_healed_errors(spa, &last, &scrub, tx);
+
+ /*
* Sync out the current list of errors.
*/
sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
@@ -376,8 +1303,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
*/
if (scrub_finished) {
if (spa->spa_errlog_last != 0)
- VERIFY(dmu_object_free(spa->spa_meta_objset,
- spa->spa_errlog_last, tx) == 0);
+ delete_errlog(spa, spa->spa_errlog_last, tx);
spa->spa_errlog_last = spa->spa_errlog_scrub;
spa->spa_errlog_scrub = 0;
@@ -402,15 +1328,163 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
dmu_tx_commit(tx);
mutex_exit(&spa->spa_errlog_lock);
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+}
+
+static void
+delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds,
+ dmu_tx_t *tx)
+{
+ if (spa_err_obj == 0)
+ return;
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ uint64_t head_ds;
+ name_to_object(za.za_name, &head_ds);
+ if (head_ds == ds) {
+ (void) zap_remove(spa->spa_meta_objset, spa_err_obj,
+ za.za_name, tx);
+ VERIFY0(dmu_object_free(spa->spa_meta_objset,
+ za.za_first_integer, tx));
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+void
+spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx)
+{
+ mutex_enter(&spa->spa_errlog_lock);
+ delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx);
+ delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx);
+ mutex_exit(&spa->spa_errlog_lock);
+}
+
+static int
+find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head,
+ uint64_t *txg)
+{
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ int error = dsl_dataset_hold_obj_flags(dp, old_head,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+ while (prev_obj != 0) {
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 &&
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head)
+ break;
+
+ if (error != 0)
+ return (error);
+
+ prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ ASSERT(prev_obj != 0);
+ *txg = prev_obj_txg;
+ return (0);
+}
+
+static void
+swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t
+ old_head, dmu_tx_t *tx)
+{
+ if (spa_err_obj == 0)
+ return;
+
+ uint64_t old_head_errlog;
+ int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj,
+ old_head, &old_head_errlog);
+
+ /* If no error log, then there is nothing to do. */
+ if (error != 0)
+ return;
+
+ uint64_t txg;
+ error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg);
+ if (error != 0)
+ return;
+
+ /*
+ * Create an error log if the file system being promoted does not
+ * already have one.
+ */
+ uint64_t new_head_errlog;
+ error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head,
+ &new_head_errlog);
+
+ if (error != 0) {
+ new_head_errlog = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
+
+ (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj,
+ new_head, new_head_errlog, tx);
+ }
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_err_phys_t err_block;
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+
+ const char *name = "";
+ name_to_errphys(za.za_name, &err_block);
+ if (err_block.zb_birth < txg) {
+ (void) zap_update(spa->spa_meta_objset, new_head_errlog,
+ za.za_name, 1, strlen(name) + 1, name, tx);
+
+ (void) zap_remove(spa->spa_meta_objset, old_head_errlog,
+ za.za_name, tx);
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+void
+spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
+ dmu_tx_t *tx)
+{
+ mutex_enter(&spa->spa_errlog_lock);
+ swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx);
+ swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx);
+ mutex_exit(&spa->spa_errlog_lock);
}
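
Editor's note: the old_head/new_head naming and the birth-TXG filter suggest spa_swap_errlog() runs when a clone is promoted and the two datasets swap head roles: entries whose birth TXG predates the youngest snapshot shared with the new head (as found by find_txg_ancestor_snapshot()) follow the promoted dataset. The predicate below is a hypothetical helper, not in-tree code, capturing just that filter.

	/*
	 * Hypothetical helper: mirrors the zb_birth < txg test in
	 * swap_errlog(), where txg is the ancestor snapshot's birth TXG.
	 */
	#include <sys/types.h>
	#include <sys/zio.h>

	static boolean_t
	errlog_entry_moves_on_promote(const zbookmark_err_phys_t *zep,
	    uint64_t ancestor_snap_txg)
	{
		return (zep->zb_birth < ancestor_snap_txg);
	}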
#if defined(_KERNEL)
/* error handling */
EXPORT_SYMBOL(spa_log_error);
-EXPORT_SYMBOL(spa_get_errlog_size);
+EXPORT_SYMBOL(spa_approx_errlog_size);
+EXPORT_SYMBOL(spa_get_last_errlog_size);
EXPORT_SYMBOL(spa_get_errlog);
EXPORT_SYMBOL(spa_errlog_rotate);
EXPORT_SYMBOL(spa_errlog_drain);
EXPORT_SYMBOL(spa_errlog_sync);
EXPORT_SYMBOL(spa_get_errlists);
+EXPORT_SYMBOL(spa_delete_dataset_errlog);
+EXPORT_SYMBOL(spa_swap_errlog);
+EXPORT_SYMBOL(sync_error_list);
+EXPORT_SYMBOL(spa_upgrade_errlog);
+EXPORT_SYMBOL(find_top_affected_fs);
+EXPORT_SYMBOL(find_birth_txg);
+EXPORT_SYMBOL(zep_to_zb);
+EXPORT_SYMBOL(name_to_errphys);
#endif
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW,
+ "Limit the number of errors which will be upgraded to the new "
+ "on-disk error log when enabling head_errlog");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c
index dae06e46c316..de036d6c3718 100644
--- a/sys/contrib/openzfs/module/zfs/spa_history.c
+++ b/sys/contrib/openzfs/module/zfs/spa_history.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -199,7 +199,7 @@ spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
{
nvlist_t *hist_nvl = fnvlist_alloc();
uint64_t uint64;
- char *string;
+ const char *string;
if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
@@ -248,7 +248,6 @@ spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
/*
* Write out a history event.
*/
-/*ARGSUSED*/
static void
spa_history_log_sync(void *arg, dmu_tx_t *tx)
{
diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
index 6fd302b8df34..32158e8c592c 100644
--- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
+++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -177,7 +177,7 @@
* block size as we expect to be writing a lot of data to them at
* once.
*/
-unsigned long zfs_log_sm_blksz = 1ULL << 17;
+static const unsigned long zfs_log_sm_blksz = 1ULL << 17;
/*
* Percentage of the overall system's memory that ZFS allows to be
@@ -188,13 +188,13 @@ unsigned long zfs_log_sm_blksz = 1ULL << 17;
* (thus the _ppm suffix; reads as "parts per million"). As an example,
* the default of 1000 allows 0.1% of memory to be used.
*/
-unsigned long zfs_unflushed_max_mem_ppm = 1000;
+static uint64_t zfs_unflushed_max_mem_ppm = 1000;
/*
* Specific hard-limit in memory that ZFS allows to be used for
* unflushed changes.
*/
-unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
+static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30;
/*
* The following tunable determines the number of blocks that can be used for
@@ -243,28 +243,33 @@ unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
* provide upper and lower bounds for the log block limit.
* [see zfs_unflushed_log_block_{min,max}]
*/
-unsigned long zfs_unflushed_log_block_pct = 400;
+static uint_t zfs_unflushed_log_block_pct = 400;
/*
* If the number of metaslabs is small and our incoming rate is high, we could
* get into a situation that we are flushing all our metaslabs every TXG. Thus
* we always allow at least this many log blocks.
*/
-unsigned long zfs_unflushed_log_block_min = 1000;
+static uint64_t zfs_unflushed_log_block_min = 1000;
/*
* If the log becomes too big, the import time of the pool can take a hit in
* terms of performance. Thus we have a hard limit in the size of the log in
* terms of blocks.
*/
-unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+static uint64_t zfs_unflushed_log_block_max = (1ULL << 17);
+
+/*
+ * Also we have a hard limit in the size of the log in terms of dirty TXGs.
+ */
+static uint64_t zfs_unflushed_log_txg_max = 1000;
/*
* Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
* stability of the flushing algorithm (longer summary) vs its runtime overhead
* (smaller summary is faster to traverse).
*/
-unsigned long zfs_max_logsm_summary_length = 10;
+static uint64_t zfs_max_logsm_summary_length = 10;
/*
* Tunable that sets the lower bound on the metaslabs to flush every TXG.
@@ -277,7 +282,7 @@ unsigned long zfs_max_logsm_summary_length = 10;
* The point of this tunable is to be used in extreme cases where we really
* want to flush more metaslabs than our adaptable heuristic plans to flush.
*/
-unsigned long zfs_min_metaslabs_to_flush = 1;
+static uint64_t zfs_min_metaslabs_to_flush = 1;
/*
* Tunable that specifies how far in the past do we want to look when trying to
@@ -288,7 +293,7 @@ unsigned long zfs_min_metaslabs_to_flush = 1;
* average over all the blocks that we walk
* [see spa_estimate_incoming_log_blocks].
*/
-unsigned long zfs_max_log_walking = 5;
+static uint64_t zfs_max_log_walking = 5;
/*
* This tunable exists solely for testing purposes. It ensures that the log
@@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa)
return;
}
- uint64_t calculated_limit =
- (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
- spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+ uint64_t msdcount = 0;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e))
+ msdcount += e->lse_msdcount;
+
+ uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
+ spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
}
@@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa)
}
static boolean_t
-summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
{
+ if (e->lse_end == txg)
+ return (0);
+ if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
+ zfs_max_logsm_summary_length))
+ return (1);
uint64_t blocks_per_row = MAX(1,
DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
zfs_max_logsm_summary_length));
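
Editor's note: besides the existing block-count check, a summary row is now also considered full once it spans DIV_ROUND_UP(zfs_unflushed_log_txg_max, zfs_max_logsm_summary_length) TXGs (data for the TXG the row already ends at still lands in it), so the dirty-TXG budget is spread roughly evenly over the summary rows. A userland sketch using the defaults introduced by this patch:

	/*
	 * Userland sketch of the per-row TXG cap, using the defaults above
	 * (zfs_unflushed_log_txg_max = 1000, zfs_max_logsm_summary_length = 10).
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	int
	main(void)
	{
		uint64_t txg_max = 1000;	/* zfs_unflushed_log_txg_max */
		uint64_t summary_rows = 10;	/* zfs_max_logsm_summary_length */

		/* summary_entry_is_full() starts a new row past this span. */
		printf("max TXGs per summary row: %llu\n",
		    (unsigned long long)DIV_ROUND_UP(txg_max, summary_rows));
		return (0);
	}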
@@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
* the metaslab.
*/
void
-spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
{
/*
* We don't track summary data for read-only pools and this function
@@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
}
target->lse_mscount--;
+ if (dirty)
+ target->lse_msdcount--;
}
/*
@@ -490,15 +506,12 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
void
spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
{
- for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
- e != NULL; e = list_head(&spa->spa_log_summary)) {
+ log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ ASSERT3P(e, !=, NULL);
+ if (e->lse_txgcount > 0)
+ e->lse_txgcount--;
+ for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
if (e->lse_blkcount > blocks_gone) {
- /*
- * Assert that we stopped at an entry that is not
- * obsolete.
- */
- ASSERT(e->lse_mscount != 0);
-
e->lse_blkcount -= blocks_gone;
blocks_gone = 0;
break;
@@ -560,31 +573,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa)
static void
summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
- uint64_t nblocks)
+ uint64_t metaslabs_dirty, uint64_t nblocks)
{
log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
- if (e == NULL || summary_entry_is_full(spa, e)) {
+ if (e == NULL || summary_entry_is_full(spa, e, txg)) {
e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
- e->lse_start = txg;
+ e->lse_start = e->lse_end = txg;
+ e->lse_txgcount = 1;
list_insert_tail(&spa->spa_log_summary, e);
}
ASSERT3U(e->lse_start, <=, txg);
+ if (e->lse_end < txg) {
+ e->lse_end = txg;
+ e->lse_txgcount++;
+ }
e->lse_mscount += metaslabs_flushed;
+ e->lse_msdcount += metaslabs_dirty;
e->lse_blkcount += nblocks;
}
static void
spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
{
- summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+ summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
+}
+
+void
+spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
+{
+ summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
}
void
-spa_log_summary_add_flushed_metaslab(spa_t *spa)
+spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
{
- summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+ log_summary_entry_t *target = NULL;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+ if (e->lse_start > txg)
+ break;
+ target = e;
+ }
+ ASSERT3P(target, !=, NULL);
+ ASSERT3U(target->lse_mscount, !=, 0);
+ target->lse_msdcount++;
}
/*
@@ -630,6 +664,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
int64_t available_blocks =
spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
+ int64_t available_txgs = zfs_unflushed_log_txg_max;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e))
+ available_txgs -= e->lse_txgcount;
+
/*
* This variable tells us the total number of flushes needed to
* keep the log size within the limit when we reach txgs_in_future.
@@ -637,9 +676,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
uint64_t total_flushes = 0;
/* Holds the current maximum of our estimates so far. */
- uint64_t max_flushes_pertxg =
- MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
- zfs_min_metaslabs_to_flush);
+ uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
/*
* For our estimations we only look as far in the future
@@ -653,11 +690,15 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
* then keep skipping TXGs accumulating more blocks
* based on the incoming rate until we exceed it.
*/
- if (available_blocks >= 0) {
- uint64_t skip_txgs = (available_blocks / incoming) + 1;
+ if (available_blocks >= 0 && available_txgs >= 0) {
+ uint64_t skip_txgs = (incoming == 0) ?
+ available_txgs + 1 : MIN(available_txgs + 1,
+ (available_blocks / incoming) + 1);
available_blocks -= (skip_txgs * incoming);
+ available_txgs -= skip_txgs;
txgs_in_future += skip_txgs;
ASSERT3S(available_blocks, >=, -incoming);
+ ASSERT3S(available_txgs, >=, -1);
}
/*
@@ -666,9 +707,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
* based on the current entry in the summary, updating
* our available_blocks.
*/
- ASSERT3S(available_blocks, <, 0);
+ ASSERT(available_blocks < 0 || available_txgs < 0);
available_blocks += e->lse_blkcount;
- total_flushes += e->lse_mscount;
+ available_txgs += e->lse_txgcount;
+ total_flushes += e->lse_msdcount;
/*
* Keep the running maximum of the total_flushes that
@@ -680,8 +722,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
*/
max_flushes_pertxg = MAX(max_flushes_pertxg,
DIV_ROUND_UP(total_flushes, txgs_in_future));
- ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
- max_flushes_pertxg);
}
return (max_flushes_pertxg);
}
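
Editor's note: the estimator now walks the summary with two budgets at once, log blocks and dirty TXGs, and the skipping step advances by whichever budget runs out first. A small userland illustration of that step with made-up inputs:

	/*
	 * Userland sketch of the TXG-skipping step in
	 * spa_estimate_metaslabs_to_flush(); the input numbers are
	 * illustrative only.
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		int64_t available_blocks = 500;	/* block budget left in the log */
		int64_t available_txgs = 40;	/* dirty-TXG budget left */
		uint64_t incoming = 30;		/* estimated incoming blocks per TXG */

		/* Advance by whichever budget is exhausted first. */
		uint64_t skip_txgs = (incoming == 0) ?
		    (uint64_t)available_txgs + 1 :
		    MIN((uint64_t)available_txgs + 1,
		    (uint64_t)(available_blocks / incoming) + 1);

		/* 500 / 30 + 1 = 17, smaller than 40 + 1 = 41. */
		printf("skip %llu txgs\n", (unsigned long long)skip_txgs);
		return (0);
	}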
@@ -743,7 +783,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
* request of flushing everything before we attempt to return
* immediately.
*/
- if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
!dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
!spa_flush_all_logs_requested(spa))
return;
@@ -771,14 +811,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
uint64_t want_to_flush;
if (spa_flush_all_logs_requested(spa)) {
ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
- want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+ want_to_flush = UINT64_MAX;
} else {
want_to_flush = spa_estimate_metaslabs_to_flush(spa);
}
- ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
- want_to_flush);
-
/* Used purely for verification purposes */
uint64_t visited = 0;
@@ -809,31 +846,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
break;
- mutex_enter(&curr->ms_sync_lock);
- mutex_enter(&curr->ms_lock);
- boolean_t flushed = metaslab_flush(curr, tx);
- mutex_exit(&curr->ms_lock);
- mutex_exit(&curr->ms_sync_lock);
-
- /*
- * If we failed to flush a metaslab (because it was loading),
- * then we are done with the block heuristic as it's not
- * possible to destroy any log space maps once you've skipped
- * a metaslab. In that case we just set our counter to 0 but
- * we continue looping in case there is still memory pressure
- * due to unflushed changes. Note that, flushing a metaslab
- * that is not the oldest flushed in the pool, will never
- * destroy any log space maps [see spa_cleanup_old_sm_logs()].
- */
- if (!flushed) {
- want_to_flush = 0;
- } else if (want_to_flush > 0) {
- want_to_flush--;
- }
+ if (metaslab_unflushed_dirty(curr)) {
+ mutex_enter(&curr->ms_sync_lock);
+ mutex_enter(&curr->ms_lock);
+ metaslab_flush(curr, tx);
+ mutex_exit(&curr->ms_lock);
+ mutex_exit(&curr->ms_sync_lock);
+ if (want_to_flush > 0)
+ want_to_flush--;
+ } else
+ metaslab_unflushed_bump(curr, tx, B_FALSE);
visited++;
}
ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+
+ spa_log_sm_set_blocklimit(spa);
}
/*
@@ -904,6 +932,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
avl_remove(&spa->spa_sm_logs_by_txg, sls);
space_map_free_obj(mos, sls->sls_sm_obj, tx);
VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+ spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
kmem_free(sls, sizeof (spa_log_sm_t));
}
@@ -963,12 +992,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
0, UINT64_MAX, SPA_MINBLOCKSHIFT));
- /*
- * If the log space map feature was just enabled, the blocklimit
- * has not yet been set.
- */
- if (spa_log_sm_blocklimit(spa) == 0)
- spa_log_sm_set_blocklimit(spa);
+ spa_log_sm_set_blocklimit(spa);
}
/*
@@ -1094,12 +1118,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
panic("invalid maptype_t");
break;
}
+ if (!metaslab_unflushed_dirty(ms)) {
+ metaslab_set_unflushed_dirty(ms, B_TRUE);
+ spa_log_summary_dirty_flushed_metaslab(spa,
+ metaslab_unflushed_txg(ms));
+ }
return (0);
}
static int
spa_ld_log_sm_data(spa_t *spa)
{
+ spa_log_sm_t *sls, *psls;
int error = 0;
/*
@@ -1113,50 +1143,98 @@ spa_ld_log_sm_data(spa_t *spa)
ASSERT0(spa->spa_unflushed_stats.sus_memused);
hrtime_t read_logs_starttime = gethrtime();
- /* this is a no-op when we don't have space map logs */
- for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
- sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
- space_map_t *sm = NULL;
- error = space_map_open(&sm, spa_meta_objset(spa),
- sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
- if (error != 0) {
- spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
- "space_map_open(obj=%llu) [error %d]",
- (u_longlong_t)sls->sls_sm_obj, error);
- goto out;
+
+ /* Prefetch log spacemaps dnodes. */
+ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
+ sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ uint_t pn = 0;
+ uint64_t ps = 0;
+ uint64_t nsm = 0;
+ psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
+ while (sls != NULL) {
+ /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
+ if (psls != NULL && pn < 16 &&
+ (pn < 2 || ps < 2 * dmu_prefetch_max)) {
+ error = space_map_open(&psls->sls_sm,
+ spa_meta_objset(spa), psls->sls_sm_obj, 0,
+ UINT64_MAX, SPA_MINBLOCKSHIFT);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_data(): "
+ "failed at space_map_open(obj=%llu) "
+ "[error %d]",
+ (u_longlong_t)sls->sls_sm_obj, error);
+ goto out;
+ }
+ dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
+ 0, 0, space_map_length(psls->sls_sm),
+ ZIO_PRIORITY_ASYNC_READ);
+ pn++;
+ ps += space_map_length(psls->sls_sm);
+ psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
+ continue;
}
+ /* Load TXG log spacemap into ms_unflushed_allocs/frees. */
+ kpreempt(KPREEMPT_SYNC);
+ ASSERT0(sls->sls_nblocks);
+ sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
+ spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+ summary_add_data(spa, sls->sls_txg,
+ sls->sls_mscount, 0, sls->sls_nblocks);
+
+ spa_import_progress_set_notes_nolog(spa,
+ "Read %llu of %lu log space maps", (u_longlong_t)nsm,
+ avl_numnodes(&spa->spa_sm_logs_by_txg));
+
struct spa_ld_log_sm_arg vla = {
.slls_spa = spa,
.slls_txg = sls->sls_txg
};
- error = space_map_iterate(sm, space_map_length(sm),
- spa_ld_log_sm_cb, &vla);
+ error = space_map_iterate(sls->sls_sm,
+ space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
if (error != 0) {
- space_map_close(sm);
spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
"at space_map_iterate(obj=%llu) [error %d]",
(u_longlong_t)sls->sls_sm_obj, error);
goto out;
}
- ASSERT0(sls->sls_nblocks);
- sls->sls_nblocks = space_map_nblocks(sm);
- spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
- summary_add_data(spa, sls->sls_txg,
- sls->sls_mscount, sls->sls_nblocks);
+ pn--;
+ ps -= space_map_length(sls->sls_sm);
+ nsm++;
+ space_map_close(sls->sls_sm);
+ sls->sls_sm = NULL;
+ sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
- space_map_close(sm);
+ /* Update log block limits considering just loaded. */
+ spa_log_sm_set_blocklimit(spa);
}
+
hrtime_t read_logs_endtime = gethrtime();
spa_load_note(spa,
- "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
- "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
+ "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) "
+ "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg),
(u_longlong_t)spa_log_sm_nblocks(spa),
(u_longlong_t)zfs_log_sm_blksz,
- (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
+ (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime));
out:
+ if (error != 0) {
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ if (sls->sls_sm) {
+ space_map_close(sls->sls_sm);
+ sls->sls_sm = NULL;
+ }
+ }
+ } else {
+ ASSERT0(pn);
+ ASSERT0(ps);
+ }
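
Editor's note: the loader now overlaps reading and processing with a bounded read-ahead window: at most 16 log space maps are kept opened and prefetched ahead of the one being iterated, and past the first two the window is also capped at roughly twice dmu_prefetch_max bytes. The function below is a generic userland sketch of that pattern, not ZFS code; the struct, constants, and the open/load callbacks are assumptions standing in for space_map_open()/dmu_prefetch() and space_map_iterate().

	/* Generic sketch of the bounded read-ahead window used above. */
	#include <stddef.h>
	#include <stdint.h>

	#define	PREFETCH_MAX_ITEMS	16

	struct item {
		struct item *next;
		uint64_t length;
	};

	static void
	load_with_readahead(struct item *head, uint64_t prefetch_max_bytes,
	    void (*open_fn)(struct item *), void (*load_fn)(struct item *))
	{
		struct item *it = head, *pit = head;
		unsigned int pn = 0;	/* items opened ahead of the consumer */
		uint64_t ps = 0;	/* bytes prefetched ahead of the consumer */

		while (it != NULL) {
			/* Keep opening/prefetching while within both limits. */
			if (pit != NULL && pn < PREFETCH_MAX_ITEMS &&
			    (pn < 2 || ps < 2 * prefetch_max_bytes)) {
				open_fn(pit);
				pn++;
				ps += pit->length;
				pit = pit->next;
				continue;
			}
			/* Consume the oldest opened item, shrinking the window. */
			load_fn(it);
			pn--;
			ps -= it->length;
			it = it->next;
		}
	}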
/*
* Now that the metaslabs contain their unflushed changes:
* [1] recalculate their actual allocated space
@@ -1237,6 +1315,9 @@ spa_ld_unflushed_txgs(vdev_t *vd)
}
ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+ ms->ms_unflushed_dirty = B_FALSE;
+ ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
if (ms->ms_unflushed_txg != 0) {
mutex_enter(&spa->spa_flushed_ms_lock);
avl_add(&spa->spa_metaslabs_by_flushed, ms);
@@ -1283,40 +1364,44 @@ spa_ld_log_spacemaps(spa_t *spa)
}
/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW,
- "Specific hard-limit in memory that ZFS allows to be used for "
- "unflushed changes");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW,
+ "Specific hard-limit in memory that ZFS allows to be used for "
+ "unflushed changes");
-ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW,
- "Percentage of the overall system memory that ZFS allows to be "
- "used for unflushed changes (value is calculated over 1000000 for "
- "finer granularity)");
-
-ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW,
- "Hard limit (upper-bound) in the size of the space map log "
- "in terms of blocks.");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW,
+ "Percentage of the overall system memory that ZFS allows to be "
+ "used for unflushed changes (value is calculated over 1000000 for "
+ "finer granularity)");
-ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
- "Lower-bound limit for the maximum amount of blocks allowed in "
- "log spacemap (see zfs_unflushed_log_block_max)");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW,
+ "Hard limit (upper-bound) in the size of the space map log "
+ "in terms of blocks.");
-ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
- "Tunable used to determine the number of blocks that can be used for "
- "the spacemap log, expressed as a percentage of the total number of "
- "metaslabs in the pool (e.g. 400 means the number of log blocks is "
- "capped at 4 times the number of metaslabs)");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW,
+ "Lower-bound limit for the maximum amount of blocks allowed in "
+ "log spacemap (see zfs_unflushed_log_block_max)");
-ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW,
- "The number of past TXGs that the flushing algorithm of the log "
- "spacemap feature uses to estimate incoming log blocks");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW,
+ "Hard limit (upper-bound) in the size of the space map log "
+ "in terms of dirty TXGs.");
-ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW,
- "Maximum number of rows allowed in the summary of the spacemap log");
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW,
+ "Tunable used to determine the number of blocks that can be used for "
+ "the spacemap log, expressed as a percentage of the total number of "
+ "metaslabs in the pool (e.g. 400 means the number of log blocks is "
+ "capped at 4 times the number of metaslabs)");
-ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW,
- "Minimum number of metaslabs to flush per dirty TXG");
+ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW,
+ "The number of past TXGs that the flushing algorithm of the log "
+ "spacemap feature uses to estimate incoming log blocks");
ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
- "Prevent the log spacemaps from being flushed and destroyed "
- "during pool export/destroy");
+ "Prevent the log spacemaps from being flushed and destroyed "
+ "during pool export/destroy");
/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW,
+ "Maximum number of rows allowed in the summary of the spacemap log");
+
+ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW,
+ "Minimum number of metaslabs to flush per dirty TXG");
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 1ecd2294dba0..d1d41bbe7214 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,16 +20,18 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_chksum.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
@@ -56,6 +58,7 @@
#include <sys/fs/zfs.h>
#include <sys/metaslab_impl.h>
#include <sys/arc.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/kstat.h>
#include "zfs_prop.h"
@@ -79,7 +82,8 @@
* - Check if spa_refcount is zero
* - Rename a spa_t
* - add/remove/attach/detach devices
- * - Held for the duration of create/destroy/import/export
+ * - Held for the duration of create/destroy
+ * - Held at the start and end of import and export
*
* It does not need to handle recursion. A create or destroy may
* reference objects (files or zvols) in other pools, but by
@@ -232,17 +236,16 @@
* locking is, always, based on spa_namespace_lock and spa_config_lock[].
*/
-static avl_tree_t spa_namespace_avl;
+avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
-static kcondvar_t spa_namespace_cv;
-int spa_max_replication_override = SPA_DVAS_PER_BP;
+kcondvar_t spa_namespace_cv;
+static const int spa_max_replication_override = SPA_DVAS_PER_BP;
static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
static kmutex_t spa_l2cache_lock;
static avl_tree_t spa_l2cache_avl;
-kmem_cache_t *spa_buffer_pool;
spa_mode_t spa_mode_global = SPA_MODE_UNINIT;
#ifdef ZFS_DEBUG
@@ -304,25 +307,25 @@ int zfs_free_leak_on_eio = B_FALSE;
* has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
* in one of three behaviors controlled by zfs_deadman_failmode.
*/
-unsigned long zfs_deadman_synctime_ms = 600000UL;
+uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */
/*
* This value controls the maximum amount of time zio_wait() will block for an
* outstanding IO. By default this is 300 seconds at which point the "hung"
* behavior will be applied as described for zfs_deadman_synctime_ms.
*/
-unsigned long zfs_deadman_ziotime_ms = 300000UL;
+uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */
/*
* Check time in milliseconds. This defines the frequency at which we check
* for hung I/O.
*/
-unsigned long zfs_deadman_checktime_ms = 60000UL;
+uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */
/*
* By default the deadman is enabled.
*/
-int zfs_deadman_enabled = 1;
+int zfs_deadman_enabled = B_TRUE;
/*
* Controls the behavior of the deadman when it detects a "hung" I/O.
@@ -332,7 +335,7 @@ int zfs_deadman_enabled = 1;
* continue - Attempt to recover from a "hung" I/O
* panic - Panic the system
*/
-char *zfs_deadman_failmode = "wait";
+const char *zfs_deadman_failmode = "wait";
/*
* The worst case is single-sector max-parity RAID-Z blocks, in which
@@ -343,7 +346,7 @@ char *zfs_deadman_failmode = "wait";
* the worst case is:
* (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
*/
-int spa_asize_inflation = 24;
+uint_t spa_asize_inflation = 24;
/*
* Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
@@ -383,11 +386,21 @@ int spa_asize_inflation = 24;
*
* See also the comments in zfs_space_check_t.
*/
-int spa_slop_shift = 5;
-uint64_t spa_min_slop = 128ULL * 1024 * 1024;
-uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
-int spa_allocators = 4;
+uint_t spa_slop_shift = 5;
+static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
+static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
+/*
+ * Number of allocators to use, per spa instance
+ */
+static int spa_num_allocators = 4;
+static int spa_cpus_per_allocator = 4;
+
+/*
+ * Spa active allocator.
+ * Valid values are zfs_active_allocator=<dynamic|cursor|new-dynamic>.
+ */
+const char *zfs_active_allocator = "dynamic";
void
spa_load_failed(spa_t *spa, const char *fmt, ...)
@@ -415,20 +428,22 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
spa->spa_trust_config ? "trusted" : "untrusted", buf);
+
+ spa_import_progress_set_notes_nolog(spa, "%s", buf);
}
/*
* By default dedup and user data indirects land in the special class
*/
-int zfs_ddt_data_is_special = B_TRUE;
-int zfs_user_indirect_is_special = B_TRUE;
+static int zfs_ddt_data_is_special = B_TRUE;
+static int zfs_user_indirect_is_special = B_TRUE;
/*
* The percentage of special class final space reserved for metadata only.
* Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
* let metadata into the class.
*/
-int zfs_special_class_metadata_reserve_pct = 25;
+static uint_t zfs_special_class_metadata_reserve_pct = 25;
/*
* ==========================================================================
@@ -462,7 +477,7 @@ spa_config_lock_destroy(spa_t *spa)
}
int
-spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
+spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
{
for (int i = 0; i < SCL_LOCKS; i++) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
@@ -492,9 +507,11 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
return (1);
}
-void
-spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+static void
+spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
+ int mmp_flag)
{
+ (void) tag;
int wlocks_held = 0;
ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
@@ -507,7 +524,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
continue;
mutex_enter(&scl->scl_lock);
if (rw == RW_READER) {
- while (scl->scl_writer || scl->scl_write_wanted) {
+ while (scl->scl_writer ||
+ (!mmp_flag && scl->scl_write_wanted)) {
cv_wait(&scl->scl_cv, &scl->scl_lock);
}
} else {
@@ -526,8 +544,30 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
}
void
+spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+ spa_config_enter_impl(spa, locks, tag, rw, 0);
+}
+
+/*
+ * The spa_config_enter_mmp() allows the mmp thread to cut in front of
+ * outstanding write lock requests. This is needed since the mmp updates are
+ * time sensitive and failure to service them promptly will result in a
+ * suspended pool. This pool suspension has been seen in practice when there is
+ * a single disk in a pool that is responding slowly and presumably about to
+ * fail.
+ */
+
+void
+spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+ spa_config_enter_impl(spa, locks, tag, rw, 1);
+}
+
+void
spa_config_exit(spa_t *spa, int locks, const void *tag)
{
+ (void) tag;
for (int i = SCL_LOCKS - 1; i >= 0; i--) {
spa_config_lock_t *scl = &spa->spa_config_lock[i];
if (!(locks & (1 << i)))
@@ -581,6 +621,7 @@ spa_lookup(const char *name)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
+retry:
(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
/*
@@ -592,6 +633,20 @@ spa_lookup(const char *name)
*cp = '\0';
spa = avl_find(&spa_namespace_avl, &search, &where);
+ if (spa == NULL)
+ return (NULL);
+
+ /*
+ * Avoid racing with import/export, which don't hold the namespace
+ * lock for their entire duration.
+ */
+ if ((spa->spa_load_thread != NULL &&
+ spa->spa_load_thread != curthread) ||
+ (spa->spa_export_thread != NULL &&
+ spa->spa_export_thread != curthread)) {
+ cv_wait(&spa_namespace_cv, &spa_namespace_lock);
+ goto retry;
+ }
return (spa);
}
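
Editor's note: because import and export no longer hold the namespace lock for their entire duration, spa_lookup() now sleeps on spa_namespace_cv and redoes the lookup whenever another thread's import/export owns the spa. The snippet below is a generic userland sketch of that wait-and-retry idiom using pthreads; the registry/object types and the find callback are hypothetical, not ZFS code.

	#include <pthread.h>
	#include <stdbool.h>

	struct registry {
		pthread_mutex_t lock;	/* plays the role of spa_namespace_lock */
		pthread_cond_t cv;	/* plays the role of spa_namespace_cv */
	};

	struct object {
		pthread_t busy_thread;	/* thread currently importing/exporting */
		bool busy;
	};

	/* Caller holds reg->lock, matching the ASSERT in spa_lookup(). */
	static struct object *
	lookup_waiting(struct registry *reg,
	    struct object *(*find)(struct registry *))
	{
		struct object *obj;

	retry:
		obj = find(reg);
		if (obj == NULL)
			return (NULL);
		/* Another thread is mid import/export: wait, then look again
		 * because the tree may have changed while we slept. */
		if (obj->busy && !pthread_equal(obj->busy_thread, pthread_self())) {
			pthread_cond_wait(&reg->cv, &reg->lock);
			goto retry;
		}
		return (obj);
	}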
@@ -684,11 +739,13 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
spa_set_deadman_failmode(spa, zfs_deadman_failmode);
+ spa_set_allocator(spa, zfs_active_allocator);
zfs_refcount_create(&spa->spa_refcount);
spa_config_lock_init(spa);
spa_stats_init(spa);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
avl_add(&spa_namespace_avl, spa);
/*
@@ -697,15 +754,25 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
if (altroot)
spa->spa_root = spa_strdup(altroot);
- spa->spa_alloc_count = spa_allocators;
+ /* Do not allow more allocators than fraction of CPUs. */
+ spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
+ boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
+
spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (spa_alloc_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
- sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
+ if (spa->spa_alloc_count > 1) {
+ spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
+ mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
+ NULL);
+ }
+
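
Editor's note: spa_alloc_count is now derived from both tunables: at most spa_num_allocators, at most one allocator per spa_cpus_per_allocator CPUs, and never fewer than one. A userland illustration of that clamp using the defaults from this patch (4 and 4):

	/* Userland illustration of the allocator-count clamp above. */
	#include <stdio.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))
	#define	MAX(a, b)	((a) > (b) ? (a) : (b))

	static int
	alloc_count(int num_allocators, int cpus_per_allocator, int ncpus)
	{
		return (MAX(MIN(num_allocators,
		    ncpus / MAX(cpus_per_allocator, 1)), 1));
	}

	int
	main(void)
	{
		printf("%d\n", alloc_count(4, 4, 32));	/* 4: enough CPUs */
		printf("%d\n", alloc_count(4, 4, 8));	/* 2: capped at ncpus/4 */
		printf("%d\n", alloc_count(4, 4, 2));	/* 1: never below one */
		return (0);
	}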
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
@@ -746,6 +813,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
spa->spa_min_alloc = INT_MAX;
+ spa->spa_gcd_alloc = INT_MAX;
/* Reset cached value */
spa->spa_dedup_dspace = ~0ULL;
@@ -783,13 +851,11 @@ spa_remove(spa_t *spa)
nvlist_free(spa->spa_config_splitting);
avl_remove(&spa_namespace_avl, spa);
- cv_broadcast(&spa_namespace_cv);
if (spa->spa_root)
spa_strfree(spa->spa_root);
- while ((dp = list_head(&spa->spa_config_list)) != NULL) {
- list_remove(&spa->spa_config_list, dp);
+ while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path != NULL)
spa_strfree(dp->scd_path);
kmem_free(dp, sizeof (spa_config_dirent_t));
@@ -801,6 +867,11 @@ spa_remove(spa_t *spa)
}
kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (spa_alloc_t));
+ if (spa->spa_alloc_count > 1) {
+ mutex_destroy(&spa->spa_allocs_use->sau_lock);
+ kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
+ sau_inuse[spa->spa_alloc_count]));
+ }
avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg);
@@ -875,22 +946,25 @@ spa_next(spa_t *prev)
* have the namespace lock held.
*/
void
-spa_open_ref(spa_t *spa, void *tag)
+spa_open_ref(spa_t *spa, const void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
- MUTEX_HELD(&spa_namespace_lock));
+ MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_load_thread == curthread);
(void) zfs_refcount_add(&spa->spa_refcount, tag);
}
/*
* Remove a reference to the given spa_t. Must have at least one reference, or
- * have the namespace lock held.
+ * have the namespace lock held or be part of a pool import/export.
*/
void
-spa_close(spa_t *spa, void *tag)
+spa_close(spa_t *spa, const void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
- MUTEX_HELD(&spa_namespace_lock));
+ MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_load_thread == curthread ||
+ spa->spa_export_thread == curthread);
(void) zfs_refcount_remove(&spa->spa_refcount, tag);
}
@@ -903,20 +977,22 @@ spa_close(spa_t *spa, void *tag)
* so the asserts in spa_close() do not apply.
*/
void
-spa_async_close(spa_t *spa, void *tag)
+spa_async_close(spa_t *spa, const void *tag)
{
(void) zfs_refcount_remove(&spa->spa_refcount, tag);
}
/*
* Check to see if the spa refcount is zero. Must be called with
- * spa_namespace_lock held. We really compare against spa_minref, which is the
- * number of references acquired when opening a pool
+ * spa_namespace_lock held or be the spa export thread. We really
+ * compare against spa_minref, which is the number of references
+ * acquired when opening a pool
*/
boolean_t
spa_refcount_zero(spa_t *spa)
{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
@@ -1164,6 +1240,8 @@ spa_vdev_enter(spa_t *spa)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
+ ASSERT0(spa->spa_export_thread);
+
vdev_autotrim_stop_all(spa);
return (spa_vdev_config_enter(spa));
@@ -1181,6 +1259,8 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
+ ASSERT0(spa->spa_export_thread);
+
vdev_autotrim_stop_all(spa);
if (guid != 0) {
@@ -1213,7 +1293,8 @@ spa_vdev_config_enter(spa_t *spa)
* of multiple transactions without releasing the spa_namespace_lock.
*/
void
-spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error,
+ const char *tag)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
@@ -1287,7 +1368,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
* If the config changed, update the config cache.
*/
if (config_changed)
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
}
/*
@@ -1382,7 +1463,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
*/
if (config_changed) {
mutex_enter(&spa_namespace_lock);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
mutex_exit(&spa_namespace_lock);
}
@@ -1476,8 +1557,7 @@ spa_strdup(const char *s)
len = strlen(s);
new = kmem_alloc(len + 1, KM_SLEEP);
- bcopy(s, new, len);
- new[len] = '\0';
+ memcpy(new, s, len + 1);
return (new);
}
@@ -1512,8 +1592,8 @@ void
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
{
char type[256];
- char *checksum = NULL;
- char *compress = NULL;
+ const char *checksum = NULL;
+ const char *compress = NULL;
if (bp != NULL) {
if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
@@ -1534,7 +1614,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
- SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
+ SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum,
compress);
}
@@ -1652,10 +1732,10 @@ spa_altroot(spa_t *spa, char *buf, size_t buflen)
if (spa->spa_root == NULL)
buf[0] = '\0';
else
- (void) strncpy(buf, spa->spa_root, buflen);
+ (void) strlcpy(buf, spa->spa_root, buflen);
}
-int
+uint32_t
spa_sync_pass(spa_t *spa)
{
return (spa->spa_sync_pass);
@@ -1795,7 +1875,8 @@ spa_get_slop_space(spa_t *spa)
* deduplicated data, so since it's not useful to reserve more
* space with more deduplicated data, we subtract that out here.
*/
- space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
+ space =
+ spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
slop = MIN(space >> spa_slop_shift, spa_max_slop);
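
Editor's note: with BRT (block cloning) savings now subtracted alongside dedup savings, the slop reservation is 1/(2^spa_slop_shift) of the remaining dspace, capped at spa_max_slop. The userland sketch below works one example; the later adjustments in the in-tree function (e.g. the lower bound near spa_min_slop) are outside this hunk and omitted here.

	/* Userland sketch of the visible part of the slop computation. */
	#include <stdio.h>
	#include <stdint.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; /* 128 GiB */
		const unsigned int spa_slop_shift = 5;	/* reserve ~1/32 of dspace */

		/* 10 TiB of dspace left after subtracting dedup and BRT savings. */
		uint64_t space = 10ULL * 1024 * 1024 * 1024 * 1024;
		uint64_t slop = MIN(space >> spa_slop_shift, spa_max_slop);

		/* 10 TiB / 32 = 320 GiB, clamped to the 128 GiB ceiling. */
		printf("slop = %llu GiB\n", (unsigned long long)(slop >> 30));
		return (0);
	}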
/*
@@ -1832,37 +1913,28 @@ void
spa_update_dspace(spa_t *spa)
{
spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
- ddt_get_dedup_dspace(spa);
- if (spa->spa_vdev_removal != NULL) {
+ ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
+ if (spa->spa_nonallocating_dspace > 0) {
/*
- * We can't allocate from the removing device, so subtract
- * its size if it was included in dspace (i.e. if this is a
- * normal-class vdev, not special/dedup). This prevents the
- * DMU/DSL from filling up the (now smaller) pool while we
- * are in the middle of removing the device.
+ * Subtract the space provided by all non-allocating vdevs that
+ * contribute to dspace. If a file is overwritten, its old
+ * blocks are freed and new blocks are allocated. If there are
+ * no snapshots of the file, the available space should remain
+ * the same. The old blocks could be freed from the
+ * non-allocating vdev, but the new blocks must be allocated on
+ * other (allocating) vdevs. By reserving the entire size of
+ * the non-allocating vdevs (including allocated space), we
+ * ensure that there will be enough space on the allocating
+ * vdevs for this file overwrite to succeed.
*
* Note that the DMU/DSL doesn't actually know or care
* how much space is allocated (it does its own tracking
* of how much space has been logically used). So it
* doesn't matter that the data we are moving may be
- * allocated twice (on the old device and the new
- * device).
- */
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- vdev_t *vd =
- vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
- /*
- * If the stars align, we can wind up here after
- * vdev_remove_complete() has cleared vd->vdev_mg but before
- * spa->spa_vdev_removal gets cleared, so we must check before
- * we dereference.
+ * allocated twice (on the old device and the new device).
*/
- if (vd->vdev_mg &&
- vd->vdev_mg->mg_class == spa_normal_class(spa)) {
- spa->spa_dspace -= spa_deflate(spa) ?
- vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
- }
- spa_config_exit(spa, SCL_VDEV, FTAG);
+ ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace);
+ spa->spa_dspace -= spa->spa_nonallocating_dspace;
}
}
@@ -2153,6 +2225,7 @@ typedef struct spa_import_progress {
uint64_t pool_guid; /* unique id for updates */
char *pool_name;
spa_load_state_t spa_load_state;
+ char *spa_load_notes;
uint64_t mmp_sec_remaining; /* MMP activity check */
uint64_t spa_load_max_txg; /* rewind txg */
procfs_list_node_t smh_node;
@@ -2163,9 +2236,9 @@ spa_history_list_t *spa_import_progress_list = NULL;
static int
spa_import_progress_show_header(struct seq_file *f)
{
- seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid",
+ seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid",
"load_state", "multihost_secs", "max_txg",
- "pool_name");
+ "pool_name", "notes");
return (0);
}
@@ -2174,11 +2247,12 @@ spa_import_progress_show(struct seq_file *f, void *data)
{
spa_import_progress_t *sip = (spa_import_progress_t *)data;
- seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n",
+ seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n",
(u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
(u_longlong_t)sip->mmp_sec_remaining,
(u_longlong_t)sip->spa_load_max_txg,
- (sip->pool_name ? sip->pool_name : "-"));
+ (sip->pool_name ? sip->pool_name : "-"),
+ (sip->spa_load_notes ? sip->spa_load_notes : "-"));
return (0);
}
@@ -2192,6 +2266,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
sip = list_remove_head(&shl->procfs_list.pl_list);
if (sip->pool_name)
spa_strfree(sip->pool_name);
+ if (sip->spa_load_notes)
+ kmem_strfree(sip->spa_load_notes);
kmem_free(sip, sizeof (spa_import_progress_t));
shl->size--;
}
@@ -2247,6 +2323,10 @@ spa_import_progress_set_state(uint64_t pool_guid,
sip = list_prev(&shl->procfs_list.pl_list, sip)) {
if (sip->pool_guid == pool_guid) {
sip->spa_load_state = load_state;
+ if (sip->spa_load_notes != NULL) {
+ kmem_strfree(sip->spa_load_notes);
+ sip->spa_load_notes = NULL;
+ }
error = 0;
break;
}
@@ -2256,6 +2336,59 @@ spa_import_progress_set_state(uint64_t pool_guid,
return (error);
}
+static void
+spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg,
+ const char *fmt, va_list adx)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ uint64_t pool_guid = spa_guid(spa);
+
+ if (shl->size == 0)
+ return;
+
+ char *notes = kmem_vasprintf(fmt, adx);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ if (sip->spa_load_notes != NULL) {
+ kmem_strfree(sip->spa_load_notes);
+ sip->spa_load_notes = NULL;
+ }
+ sip->spa_load_notes = notes;
+ if (log_dbgmsg)
+ zfs_dbgmsg("'%s' %s", sip->pool_name, notes);
+ notes = NULL;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+ if (notes != NULL)
+ kmem_strfree(notes);
+}
+
+void
+spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx);
+ va_end(adx);
+}
+
+void
+spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx);
+ va_end(adx);
+}
+
int
spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
{
@@ -2313,7 +2446,7 @@ spa_import_progress_add(spa_t *spa)
{
spa_history_list_t *shl = spa_import_progress_list;
spa_import_progress_t *sip;
- char *poolname = NULL;
+ const char *poolname = NULL;
sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP);
sip->pool_guid = spa_guid(spa);
@@ -2324,6 +2457,7 @@ spa_import_progress_add(spa_t *spa)
poolname = spa_name(spa);
sip->pool_name = spa_strdup(poolname);
sip->spa_load_state = spa_load_state(spa);
+ sip->spa_load_notes = NULL;
mutex_enter(&shl->procfs_list.pl_lock);
procfs_list_add(&shl->procfs_list, sip);
@@ -2343,6 +2477,8 @@ spa_import_progress_remove(uint64_t pool_guid)
if (sip->pool_guid == pool_guid) {
if (sip->pool_name)
spa_strfree(sip->pool_name);
+ if (sip->spa_load_notes)
+ spa_strfree(sip->spa_load_notes);
list_remove(&shl->procfs_list.pl_list, sip);
shl->size--;
kmem_free(sip, sizeof (spa_import_progress_t));
@@ -2417,18 +2553,20 @@ spa_init(spa_mode_t mode)
unique_init();
zfs_btree_init();
metaslab_stat_init();
+ brt_init();
ddt_init();
zio_init();
dmu_init();
zil_init();
- vdev_cache_stat_init();
vdev_mirror_stat_init();
vdev_raidz_math_init();
vdev_file_init();
zfs_prop_init();
+ chksum_init();
zpool_prop_init();
zpool_feature_init();
spa_config_load();
+ vdev_prop_init();
l2arc_start();
scan_init();
qat_init();
@@ -2443,13 +2581,14 @@ spa_fini(void)
spa_evict_all();
vdev_file_fini();
- vdev_cache_stat_fini();
vdev_mirror_stat_fini();
vdev_raidz_math_fini();
+ chksum_fini();
zil_fini();
dmu_fini();
zio_fini();
ddt_fini();
+ brt_fini();
metaslab_stat_fini();
zfs_btree_fini();
unique_fini();
@@ -2557,10 +2696,18 @@ spa_scan_stat_init(spa_t *spa)
spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
else
spa->spa_scan_pass_scrub_pause = 0;
+
+ if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan))
+ spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start;
+ else
+ spa->spa_scan_pass_errorscrub_pause = 0;
+
spa->spa_scan_pass_scrub_spent_paused = 0;
spa->spa_scan_pass_exam = 0;
spa->spa_scan_pass_issued = 0;
- vdev_scan_stat_init(spa->spa_root_vdev);
+
+	/* error scrub stats */
+ spa->spa_scan_pass_errorscrub_spent_paused = 0;
}
/*
@@ -2571,9 +2718,11 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
{
dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
- if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+ if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE &&
+ scn->errorscrub_phys.dep_func == POOL_SCAN_NONE))
return (SET_ERROR(ENOENT));
- bzero(ps, sizeof (pool_scan_stat_t));
+
+ memset(ps, 0, sizeof (pool_scan_stat_t));
/* data stored on disk */
ps->pss_func = scn->scn_phys.scn_func;
@@ -2582,7 +2731,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
ps->pss_end_time = scn->scn_phys.scn_end_time;
ps->pss_to_examine = scn->scn_phys.scn_to_examine;
ps->pss_examined = scn->scn_phys.scn_examined;
- ps->pss_to_process = scn->scn_phys.scn_to_process;
+ ps->pss_skipped = scn->scn_phys.scn_skipped;
ps->pss_processed = scn->scn_phys.scn_processed;
ps->pss_errors = scn->scn_phys.scn_errors;
@@ -2595,6 +2744,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
ps->pss_issued =
scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
+ /* error scrub data stored on disk */
+ ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func;
+ ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state;
+ ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time;
+ ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time;
+ ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined;
+ ps->pss_error_scrub_to_be_examined =
+ scn->errorscrub_phys.dep_to_examine;
+
+ /* error scrub data not stored on disk */
+ ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause;
+
return (0);
}
@@ -2714,8 +2875,7 @@ spa_state_to_name(spa_t *spa)
vdev_state_t state = rvd->vdev_state;
vdev_aux_t aux = rvd->vdev_stat.vs_aux;
- if (spa_suspended(spa) &&
- (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE))
+ if (spa_suspended(spa))
return ("SUSPENDED");
switch (state) {
@@ -2926,13 +3086,13 @@ ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW,
"Set to ignore IO errors during free and permanently leak the space");
-ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW,
"Dead I/O check interval in milliseconds");
ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW,
"Enable deadman timer");
-ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW,
"SPA size estimate multiplication factor");
ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW,
@@ -2947,17 +3107,23 @@ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode,
"Failmode for deadman timer");
ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms,
- param_set_deadman_synctime, param_get_ulong, ZMOD_RW,
+ param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW,
"Pool sync expiration time in milliseconds");
ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms,
- param_set_deadman_ziotime, param_get_ulong, ZMOD_RW,
+ param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW,
"IO expiration time in milliseconds");
-ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW,
"Small file blocks in special vdevs depends on this much "
"free space available");
/* END CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
- param_get_int, ZMOD_RW, "Reserved free space in pool");
+ param_get_uint, ZMOD_RW, "Reserved free space in pool");
+
+ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
+ "Number of allocators per spa");
+
+ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
+ "Minimum number of CPUs per allocators");
diff --git a/sys/contrib/openzfs/module/zfs/spa_stats.c b/sys/contrib/openzfs/module/zfs/spa_stats.c
index 534ac72fee7b..17ed2a620b1e 100644
--- a/sys/contrib/openzfs/module/zfs/spa_stats.c
+++ b/sys/contrib/openzfs/module/zfs/spa_stats.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -28,22 +28,22 @@
/*
* Keeps stats on last N reads per spa_t, disabled by default.
*/
-int zfs_read_history = 0;
+static uint_t zfs_read_history = B_FALSE;
/*
* Include cache hits in history, disabled by default.
*/
-int zfs_read_history_hits = 0;
+static int zfs_read_history_hits = B_FALSE;
/*
* Keeps stats on the last 100 txgs by default.
*/
-int zfs_txg_history = 100;
+static uint_t zfs_txg_history = 100;
/*
* Keeps stats on the last N MMP updates, disabled by default.
*/
-int zfs_multihost_history = 0;
+static uint_t zfs_multihost_history = B_FALSE;
/*
* ==========================================================================
@@ -819,6 +819,41 @@ spa_state_init(spa_t *spa)
kmem_strfree(name);
}
+static int
+spa_guid_data(char *buf, size_t size, void *data)
+{
+ spa_t *spa = (spa_t *)data;
+ (void) snprintf(buf, size, "%llu\n", (u_longlong_t)spa_guid(spa));
+ return (0);
+}
+
+static void
+spa_guid_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.guid;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ ksp = kstat_create(name, 0, "guid", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ shk->kstat = ksp;
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
+ kstat_set_raw_ops(ksp, NULL, spa_guid_data, spa_state_addr);
+ kstat_install(ksp);
+ }
+
+ kmem_strfree(name);
+}
+
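The new guid kstat above exposes the pool GUID as a single line of text via spa_guid_data(). For illustration only (not part of this change), a minimal userspace reader might look like the following; it assumes the Linux SPL kstat layout in which a kstat registered under "zfs/<pool>" with name "guid" shows up as /proc/spl/kstat/zfs/<pool>/guid, and "tank" is a placeholder pool name.

/* Illustrative only; path layout and pool name are assumptions. */
#include <stdio.h>

int
main(void)
{
	FILE *f = fopen("/proc/spl/kstat/zfs/tank/guid", "r");
	unsigned long long guid;

	if (f == NULL || fscanf(f, "%llu", &guid) != 1) {
		perror("guid kstat");
		return (1);
	}
	(void) fclose(f);
	(void) printf("pool guid: %llu\n", guid);
	return (0);
}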
static void
spa_health_destroy(spa_t *spa)
{
@@ -830,7 +865,18 @@ spa_health_destroy(spa_t *spa)
mutex_destroy(&shk->lock);
}
-static spa_iostats_t spa_iostats_template = {
+static void
+spa_guid_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.guid;
+ kstat_t *ksp = shk->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&shk->lock);
+}
+
+static const spa_iostats_t spa_iostats_template = {
{ "trim_extents_written", KSTAT_DATA_UINT64 },
{ "trim_bytes_written", KSTAT_DATA_UINT64 },
{ "trim_extents_skipped", KSTAT_DATA_UINT64 },
@@ -950,6 +996,7 @@ spa_stats_init(spa_t *spa)
spa_tx_assign_init(spa);
spa_mmp_history_init(spa);
spa_state_init(spa);
+ spa_guid_init(spa);
spa_iostats_init(spa);
}
@@ -962,18 +1009,17 @@ spa_stats_destroy(spa_t *spa)
spa_txg_history_destroy(spa);
spa_read_history_destroy(spa);
spa_mmp_history_destroy(spa);
+ spa_guid_destroy(spa);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW,
- "Historical statistics for the last N reads");
+ZFS_MODULE_PARAM(zfs, zfs_, read_history, UINT, ZMOD_RW,
+ "Historical statistics for the last N reads");
ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
- "Include cache hits in read history");
+ "Include cache hits in read history");
-ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW,
- "Historical statistics for the last N txgs");
+ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, UINT, ZMOD_RW,
+ "Historical statistics for the last N txgs");
-ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW,
- "Historical statistics for last N multihost writes");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, UINT, ZMOD_RW,
+ "Historical statistics for last N multihost writes");
diff --git a/sys/contrib/openzfs/module/zfs/space_map.c b/sys/contrib/openzfs/module/zfs/space_map.c
index 11d4798925e4..a336ff41eadb 100644
--- a/sys/contrib/openzfs/module/zfs/space_map.c
+++ b/sys/contrib/openzfs/module/zfs/space_map.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -455,7 +455,8 @@ space_map_histogram_clear(space_map_t *sm)
if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
return;
- bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
+ memset(sm->sm_phys->smp_histogram, 0,
+ sizeof (sm->sm_phys->smp_histogram));
}
boolean_t
@@ -548,7 +549,7 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
static void
space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
- void *tag, dmu_tx_t *tx)
+ const void *tag, dmu_tx_t *tx)
{
ASSERT3U(words, !=, 0);
ASSERT3U(words, <=, 2);
@@ -896,7 +897,7 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
* will be reset. Do the same in the common case so that
* bugs related to the uncommon case do not go unnoticed.
*/
- bzero(sm->sm_phys->smp_histogram,
+ memset(sm->sm_phys->smp_histogram, 0,
sizeof (sm->sm_phys->smp_histogram));
}
diff --git a/sys/contrib/openzfs/module/zfs/space_reftree.c b/sys/contrib/openzfs/module/zfs/space_reftree.c
index 080fc6646512..ee11e162dd5b 100644
--- a/sys/contrib/openzfs/module/zfs/space_reftree.c
+++ b/sys/contrib/openzfs/module/zfs/space_reftree.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
index c9eb84bbdb12..5ce6be69be14 100644
--- a/sys/contrib/openzfs/module/zfs/txg.c
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -108,10 +108,10 @@
* now transition to the syncing state.
*/
-static void txg_sync_thread(void *arg);
-static void txg_quiesce_thread(void *arg);
+static __attribute__((noreturn)) void txg_sync_thread(void *arg);
+static __attribute__((noreturn)) void txg_quiesce_thread(void *arg);
-int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
+uint_t zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
/*
* Prepare the txg subsystem.
@@ -121,7 +121,7 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
int c;
- bzero(tx, sizeof (tx_state_t));
+ memset(tx, 0, sizeof (tx_state_t));
tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
@@ -186,7 +186,7 @@ txg_fini(dsl_pool_t *dp)
vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
- bzero(tx, sizeof (tx_state_t));
+ memset(tx, 0, sizeof (tx_state_t));
}
/*
@@ -429,7 +429,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
}
static void
-txg_do_callbacks(list_t *cb_list)
+txg_do_callbacks(void *cb_list)
{
dmu_tx_do_callbacks(cb_list, 0);
@@ -479,7 +479,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
list_move_tail(cb_list, &tc->tc_callbacks[g]);
- (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
+ (void) taskq_dispatch(tx->tx_commit_cb_taskq,
txg_do_callbacks, cb_list, TQ_SLEEP);
}
}
@@ -514,7 +514,7 @@ txg_has_quiesced_to_sync(dsl_pool_t *dp)
return (tx->tx_quiesced_txg != 0);
}
-static void
+static __attribute__((noreturn)) void
txg_sync_thread(void *arg)
{
dsl_pool_t *dp = arg;
@@ -551,6 +551,15 @@ txg_sync_thread(void *arg)
}
/*
+ * When we're suspended, nothing should be changing, and for
+ * MMP we don't want to bump anything that would make it
+ * harder to detect whether another host is modifying the pool
+ * when resuming after an MMP suspend.
+ */
+ if (spa_suspended(spa))
+ continue;
+
+ /*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
*/
@@ -605,7 +614,7 @@ txg_sync_thread(void *arg)
}
}
-static void
+static __attribute__((noreturn)) void
txg_quiesce_thread(void *arg)
{
dsl_pool_t *dp = arg;
@@ -895,15 +904,10 @@ txg_list_destroy(txg_list_t *tl)
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
- mutex_enter(&tl->tl_lock);
- for (int i = 0; i < TXG_SIZE; i++) {
- if (!txg_list_empty_impl(tl, i)) {
- mutex_exit(&tl->tl_lock);
- return (B_FALSE);
- }
- }
- mutex_exit(&tl->tl_lock);
- return (B_TRUE);
+ boolean_t res = B_TRUE;
+ for (int i = 0; i < TXG_SIZE; i++)
+ res &= (tl->tl_head[i] == NULL);
+ return (res);
}
/*
@@ -1069,7 +1073,5 @@ EXPORT_SYMBOL(txg_wait_callbacks);
EXPORT_SYMBOL(txg_stalled);
EXPORT_SYMBOL(txg_sync_waiting);
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, UINT, ZMOD_RW,
"Max seconds worth of delta per txg");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/uberblock.c b/sys/contrib/openzfs/module/zfs/uberblock.c
index b8857d74d810..22ee8036c473 100644
--- a/sys/contrib/openzfs/module/zfs/uberblock.c
+++ b/sys/contrib/openzfs/module/zfs/uberblock.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -70,5 +70,5 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
}
ub->ub_checkpoint_txg = 0;
- return (ub->ub_rootbp.blk_birth == txg);
+ return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg);
}
diff --git a/sys/contrib/openzfs/module/zfs/unique.c b/sys/contrib/openzfs/module/zfs/unique.c
index 0e076797a002..799e4095db33 100644
--- a/sys/contrib/openzfs/module/zfs/unique.c
+++ b/sys/contrib/openzfs/module/zfs/unique.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index 4a67ba85f58a..c74f72159dc9 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -28,7 +28,8 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Datto Inc. All rights reserved.
- * Copyright [2021] Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2021, Klara Inc.
+ * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
*/
#include <sys/zfs_context.h>
@@ -57,8 +58,10 @@
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
+#include "zfs_prop.h"
/*
* One metaslab from each (normal-class) vdev is used by the ZIL. These are
@@ -79,22 +82,22 @@
* 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
* (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
*/
-int zfs_embedded_slog_min_ms = 64;
+static uint_t zfs_embedded_slog_min_ms = 64;
/* default target for number of metaslabs per top-level vdev */
-int zfs_vdev_default_ms_count = 200;
+static uint_t zfs_vdev_default_ms_count = 200;
/* minimum number of metaslabs per top-level vdev */
-int zfs_vdev_min_ms_count = 16;
+static uint_t zfs_vdev_min_ms_count = 16;
/* practical upper limit of total metaslabs per top-level vdev */
-int zfs_vdev_ms_count_limit = 1ULL << 17;
+static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
/* lower limit for metaslab size (512M) */
-int zfs_vdev_default_ms_shift = 29;
+static uint_t zfs_vdev_default_ms_shift = 29;
/* upper limit for metaslab size (16G) */
-int zfs_vdev_max_ms_shift = 34;
+static uint_t zfs_vdev_max_ms_shift = 34;
int vdev_validate_skip = B_FALSE;
@@ -107,18 +110,23 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12);
/*
* Rate limit slow IO (delay) events to this many per second.
*/
-unsigned int zfs_slow_io_events_per_second = 20;
+static unsigned int zfs_slow_io_events_per_second = 20;
+
+/*
+ * Rate limit deadman "hung IO" events to this many per second.
+ */
+static unsigned int zfs_deadman_events_per_second = 1;
/*
* Rate limit checksum events after this many checksum errors per second.
*/
-unsigned int zfs_checksum_events_per_second = 20;
+static unsigned int zfs_checksum_events_per_second = 20;
/*
* Ignore errors during scrub/resilver. Allows working around a
* resilver upon import when there are pool errors.
*/
-int zfs_scan_ignore_errors = 0;
+static int zfs_scan_ignore_errors = 0;
/*
* vdev-wide space maps that have lots of entries written to them at
@@ -134,8 +142,16 @@ int zfs_vdev_standard_sm_blksz = (1 << 17);
*/
int zfs_nocacheflush = 0;
-uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
-uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+/*
+ * Maximum and minimum ashift values that can be set automatically based on a
+ * vdev's physical ashift (the disk's physical sector size). Although
+ * ASHIFT_MAX is higher, the automatic maximum is intentionally capped here so
+ * that pool space efficiency is not excessively impacted. Higher ashift
+ * values may still be forced by the vdev logical ashift or by the user via
+ * the ashift property, but they won't be set automatically as a performance
+ * optimization.
+ */
+uint_t zfs_vdev_max_auto_ashift = 14;
+uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
@@ -214,7 +230,7 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
* Virtual device management.
*/
-static vdev_ops_t *vdev_ops_table[] = {
+static vdev_ops_t *const vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
&vdev_draid_ops,
@@ -236,7 +252,7 @@ static vdev_ops_t *vdev_ops_table[] = {
static vdev_ops_t *
vdev_getops(const char *type)
{
- vdev_ops_t *ops, **opspp;
+ vdev_ops_t *ops, *const *opspp;
for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
if (strcmp(ops->vdev_op_type, type) == 0)
@@ -261,11 +277,12 @@ vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
return (vd->vdev_mg);
}
-/* ARGSUSED */
void
vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
+ (void) vd, (void) remain_rs;
+
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
}
@@ -294,13 +311,13 @@ vdev_derive_alloc_bias(const char *bias)
* all children. This is what's used by anything other than RAID-Z.
*/
uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
+vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
for (int c = 0; c < vd->vdev_children; c++) {
- csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
asize = MAX(asize, csize);
}
@@ -336,7 +353,8 @@ vdev_get_min_asize(vdev_t *vd)
* to the nearest metaslab.
*/
if (vd == vd->vdev_top)
- return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
+ return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
+ uint64_t));
return (pvd->vdev_ops->vdev_op_min_asize(pvd));
}
@@ -378,6 +396,33 @@ vdev_get_nparity(vdev_t *vd)
return (nparity);
}
+static int
+vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t objid;
+ int err;
+
+ if (vd->vdev_root_zap != 0) {
+ objid = vd->vdev_root_zap;
+ } else if (vd->vdev_top_zap != 0) {
+ objid = vd->vdev_top_zap;
+ } else if (vd->vdev_leaf_zap != 0) {
+ objid = vd->vdev_leaf_zap;
+ } else {
+ return (EINVAL);
+ }
+
+ err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
+ sizeof (uint64_t), 1, value);
+
+ if (err == ENOENT)
+ *value = vdev_prop_default_numeric(prop);
+
+ return (err);
+}
+
/*
* Get the number of data disks for a top-level vdev.
*/
@@ -472,7 +517,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
newchild = kmem_alloc(newsize, KM_SLEEP);
if (pvd->vdev_child != NULL) {
- bcopy(pvd->vdev_child, newchild, oldsize);
+ memcpy(newchild, pvd->vdev_child, oldsize);
kmem_free(pvd->vdev_child, oldsize);
}
@@ -626,11 +671,21 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
*/
zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
1);
- zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second,
+ zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
1);
zfs_ratelimit_init(&vd->vdev_checksum_rl,
&zfs_checksum_events_per_second, 1);
+ /*
+ * Default thresholds for tuning ZED.
+ */
+ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
+ vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
+ vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
+ vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
+ vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
+ vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
+
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
list_link_init(&vd->vdev_initialize_node);
@@ -652,6 +707,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -668,7 +724,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
offsetof(struct vdev, vdev_dtl_node));
vd->vdev_stat.vs_timestamp = gethrtime();
vdev_queue_init(vd);
- vdev_cache_init(vd);
return (vd);
}
@@ -683,11 +738,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
int alloctype)
{
vdev_ops_t *ops;
- char *type;
+ const char *type;
uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
- char *tmp = NULL;
+ const char *tmp = NULL;
int rc;
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
boolean_t top_level = (parent && !parent->vdev_parent);
@@ -742,7 +797,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
return (SET_ERROR(ENOTSUP));
if (top_level && alloctype == VDEV_ALLOC_ADD) {
- char *bias;
+ const char *bias;
/*
* If creating a top-level vdev, check for allocation
@@ -788,8 +843,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
- vd->vdev_path = spa_strdup(vd->vdev_path);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
+ vd->vdev_path = spa_strdup(tmp);
/*
* ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
@@ -803,18 +858,17 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
}
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
- vd->vdev_devid = spa_strdup(vd->vdev_devid);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- &vd->vdev_physpath) == 0)
- vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
+ vd->vdev_devid = spa_strdup(tmp);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
+ vd->vdev_physpath = spa_strdup(tmp);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
- &vd->vdev_enc_sysfs_path) == 0)
- vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);
+ &tmp) == 0)
+ vd->vdev_enc_sysfs_path = spa_strdup(tmp);
- if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
- vd->vdev_fru = spa_strdup(vd->vdev_fru);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
+ vd->vdev_fru = spa_strdup(tmp);
/*
* Set the whole_disk property. If it's not specified, leave the value
@@ -844,9 +898,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_not_present);
/*
- * Get the alignment requirement.
+ * Get the alignment requirement. Ignore the pool ashift for the
+ * vdev attach case.
*/
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+ if (alloctype != VDEV_ALLOC_ATTACH) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+ &vd->vdev_ashift);
+ } else {
+ vd->vdev_attaching = B_TRUE;
+ }
/*
* Retrieve the vdev creation time.
@@ -854,6 +914,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
&vd->vdev_crtxg);
+ if (vd->vdev_ops == &vdev_root_ops &&
+ (alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
+ &vd->vdev_root_zap);
+ }
+
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
@@ -865,10 +933,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_ms_shift);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
&vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
+ &vd->vdev_noalloc);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
&vd->vdev_removing);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
&vd->vdev_top_zap);
+ vd->vdev_rz_expanding = nvlist_exists(nv,
+ ZPOOL_CONFIG_RAIDZ_EXPANDING);
} else {
ASSERT0(vd->vdev_top_zap);
}
@@ -943,7 +1015,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_removed);
if (vd->vdev_faulted || vd->vdev_degraded) {
- char *aux;
+ const char *aux;
vd->vdev_label_aux =
VDEV_AUX_ERR_EXCEEDED;
@@ -1040,7 +1112,6 @@ vdev_free(vdev_t *vd)
* Clean up vdev structure.
*/
vdev_queue_fini(vd);
- vdev_cache_fini(vd);
if (vd->vdev_path)
spa_strfree(vd->vdev_path);
@@ -1103,6 +1174,7 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_trim_io_lock);
cv_destroy(&vd->vdev_trim_cv);
cv_destroy(&vd->vdev_autotrim_cv);
+ cv_destroy(&vd->vdev_autotrim_kick_cv);
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
@@ -1131,7 +1203,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT(tvd == tvd->vdev_top);
- tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
tvd->vdev_ms_array = svd->vdev_ms_array;
tvd->vdev_ms_shift = svd->vdev_ms_shift;
tvd->vdev_ms_count = svd->vdev_ms_count;
@@ -1183,8 +1254,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_noalloc);
ASSERT0(tvd->vdev_removing);
ASSERT0(tvd->vdev_rebuilding);
+ tvd->vdev_noalloc = svd->vdev_noalloc;
tvd->vdev_removing = svd->vdev_removing;
tvd->vdev_rebuilding = svd->vdev_rebuilding;
tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
@@ -1200,6 +1273,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_indirect_mapping = NULL;
svd->vdev_indirect_births = NULL;
svd->vdev_obsolete_sm = NULL;
+ svd->vdev_noalloc = 0;
svd->vdev_removing = 0;
svd->vdev_rebuilding = 0;
@@ -1335,6 +1409,36 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
+/*
+ * Choose GCD for spa_gcd_alloc.
+ */
+static uint64_t
+vdev_gcd(uint64_t a, uint64_t b)
+{
+ while (b != 0) {
+ uint64_t t = b;
+ b = a % b;
+ a = t;
+ }
+ return (a);
+}
+
+/*
+ * Set spa_min_alloc and spa_gcd_alloc.
+ */
+static void
+vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
+{
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ if (spa->spa_gcd_alloc == INT_MAX) {
+ spa->spa_gcd_alloc = min_alloc;
+ } else {
+ spa->spa_gcd_alloc = vdev_gcd(min_alloc,
+ spa->spa_gcd_alloc);
+ }
+}
+
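The two helpers above fold each top-level vdev's minimum allocation size into a pool-wide greatest common divisor (spa_gcd_alloc) alongside the existing minimum (spa_min_alloc). A standalone sketch of that folding, using made-up per-vdev values, shows the effect; it is not part of the change and only mirrors the arithmetic.

/* Standalone sketch of the GCD folding done by vdev_spa_set_alloc(). */
#include <stdint.h>
#include <stdio.h>

static uint64_t
gcd(uint64_t a, uint64_t b)
{
	while (b != 0) {
		uint64_t t = b;
		b = a % b;
		a = t;
	}
	return (a);
}

int
main(void)
{
	/* Hypothetical per-vdev minimum allocation sizes (bytes). */
	uint64_t min_allocs[] = { 4096, 12288, 8192 };
	uint64_t gcd_alloc = 0;

	for (size_t i = 0; i < sizeof (min_allocs) / sizeof (min_allocs[0]); i++)
		gcd_alloc = (gcd_alloc == 0) ? min_allocs[i] :
		    gcd(min_allocs[i], gcd_alloc);

	/* Prints 4096: the largest granularity dividing every vdev's minimum. */
	(void) printf("spa_gcd_alloc = %llu\n", (unsigned long long)gcd_alloc);
	return (0);
}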
void
vdev_metaslab_group_create(vdev_t *vd)
{
@@ -1387,8 +1491,7 @@ vdev_metaslab_group_create(vdev_t *vd)
spa->spa_min_ashift = vd->vdev_ashift;
uint64_t min_alloc = vdev_get_min_alloc(vd);
- if (min_alloc < spa->spa_min_alloc)
- spa->spa_min_alloc = min_alloc;
+ vdev_spa_set_alloc(spa, min_alloc);
}
}
}
@@ -1418,7 +1521,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
if (expanding) {
- bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
@@ -1498,11 +1601,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
/*
- * If the vdev is being removed we don't activate
- * the metaslabs since we want to ensure that no new
- * allocations are performed on this device.
+ * If the vdev is marked as non-allocating then don't
+ * activate the metaslabs since we want to ensure that
+ * no allocations are performed on this device.
*/
- if (!expanding && !vd->vdev_removing) {
+ if (vd->vdev_noalloc) {
+ /* track non-allocating vdev space */
+ spa->spa_nonallocating_dspace += spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ } else if (!expanding) {
metaslab_group_activate(vd->vdev_mg);
if (vd->vdev_log_mg != NULL)
metaslab_group_activate(vd->vdev_log_mg);
@@ -1511,13 +1618,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
- /*
- * Regardless whether this vdev was just added or it is being
- * expanded, the metaslab count has changed. Recalculate the
- * block limit.
- */
- spa_log_sm_set_blocklimit(spa);
-
return (0);
}
@@ -1565,12 +1665,12 @@ vdev_metaslab_fini(vdev_t *vd)
}
}
ASSERT0(vd->vdev_ms_count);
- ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
}
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
+ boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;
@@ -1604,6 +1704,8 @@ vdev_probe_done(zio_t *zio)
vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable;
+ vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
+ vd->vdev_cant_read, vd->vdev_cant_write);
if (vdev_readable(vd) &&
(vdev_writeable(vd) || !spa_writeable(spa))) {
@@ -1614,6 +1716,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
+
+ /*
+ * If this probe was initiated from the zio pipeline, then
+ * change the state in a spa_async_request. Probes that
+ * were initiated from a vdev_open can change the state
+ * as part of the open call.
+ */
+ if (vps->vps_zio_done_probe) {
+ vd->vdev_fault_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
+ }
}
mutex_enter(&vd->vdev_probe_lock);
@@ -1663,8 +1776,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
- ZIO_FLAG_TRYHARD;
+ ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
+ vps->vps_zio_done_probe = (zio != NULL);
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@@ -1691,15 +1804,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
-
- /*
- * We can't change the vdev state in this context, so we
- * kick off an async task to do it on our behalf.
- */
- if (zio != NULL) {
- vd->vdev_probe_wanted = B_TRUE;
- spa_async_request(spa, SPA_ASYNC_PROBE);
- }
}
if (zio != NULL)
@@ -1767,6 +1871,7 @@ vdev_uses_zvols(vdev_t *vd)
static boolean_t
vdev_default_open_children_func(vdev_t *vd)
{
+ (void) vd;
return (B_TRUE);
}
@@ -1825,21 +1930,42 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
}
/*
- * Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
+ * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17)
+ * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE
+ * changed, this algorithm cannot change; otherwise it would inconsistently
+ * account for existing bp's. We also hard-code txg 0 for the same reason
+ * since expanded RAIDZ vdevs can use a different asize for different birth
+ * txg's.
*/
static void
vdev_set_deflate_ratio(vdev_t *vd)
{
if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
vd->vdev_deflate_ratio = (1 << 17) /
- (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
+ SPA_MINBLOCKSHIFT);
}
}
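For a concrete sense of the numbers, the following sketch (not part of the change; the ashift value is made up) evaluates the deflate-ratio expression for a simple non-RAIDZ top-level vdev, where the asize of the hard-coded 128K block is just the psize rounded up to the vdev's allocation granularity.

/* Sketch of the deflate-ratio arithmetic for a plain (non-RAIDZ) vdev. */
#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte units, as in the real code */

int
main(void)
{
	uint64_t ashift = 12;			/* hypothetical 4K vdev */
	uint64_t psize = 1ULL << 17;		/* the hard-coded 128K */
	/* Round psize up to the vdev's allocation granularity. */
	uint64_t asize = (psize + (1ULL << ashift) - 1) &
	    ~((1ULL << ashift) - 1);
	uint64_t deflate_ratio = (1 << 17) / (asize >> SPA_MINBLOCKSHIFT);

	/* 128K / 256 sectors = 512 for this vdev. */
	(void) printf("deflate_ratio = %llu\n",
	    (unsigned long long)deflate_ratio);
	return (0);
}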
/*
+ * Choose the better of two ashifts, preferring one that falls between the
+ * logical ashift (the absolute minimum) and the administrator-defined
+ * maximum; otherwise take the bigger of the two.
+ */
+uint64_t
+vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
+{
+ if (a > logical && a <= zfs_vdev_max_auto_ashift) {
+ if (b <= logical || b > zfs_vdev_max_auto_ashift)
+ return (a);
+ else
+ return (MAX(a, b));
+ } else if (b <= logical || b > zfs_vdev_max_auto_ashift)
+ return (MAX(a, b));
+ return (b);
+}
+
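The selection rule above can be exercised in isolation. The sketch below is illustrative only: it mirrors the same logic with a stand-in for zfs_vdev_max_auto_ashift and made-up inputs, showing that a candidate inside the (logical, max] window beats one outside it, and that the larger candidate wins when both fall inside the window.

/* Standalone mirror of the vdev_best_ashift() selection rule. */
#include <stdint.h>
#include <stdio.h>

static uint64_t max_auto_ashift = 14;	/* stand-in for zfs_vdev_max_auto_ashift */

static uint64_t
best_ashift(uint64_t logical, uint64_t a, uint64_t b)
{
	if (a > logical && a <= max_auto_ashift) {
		if (b <= logical || b > max_auto_ashift)
			return (a);
		return (a > b ? a : b);
	} else if (b <= logical || b > max_auto_ashift) {
		return (a > b ? a : b);
	}
	return (b);
}

int
main(void)
{
	/* 12 is inside the window (9, 14]; 16 is above it: 12 wins. */
	(void) printf("%llu\n", (unsigned long long)best_ashift(9, 12, 16));
	/* Both inside the window: the larger candidate (13) wins. */
	(void) printf("%llu\n", (unsigned long long)best_ashift(9, 12, 13));
	return (0);
}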
+/*
* Maximize performance by inflating the configured ashift for top level
* vdevs to be as close to the physical ashift as possible while maintaining
* administrator defined limits and ensuring it doesn't go below the
@@ -1850,7 +1976,8 @@ vdev_ashift_optimize(vdev_t *vd)
{
ASSERT(vd == vd->vdev_top);
- if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ if (vd->vdev_ashift < vd->vdev_physical_ashift &&
+ vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
vd->vdev_ashift = MIN(
MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
MAX(zfs_vdev_min_auto_ashift,
@@ -1915,6 +2042,14 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
&logical_ashift, &physical_ashift);
+
+ /* Keep the device in removed state if unplugged */
+ if (error == ENOENT && vd->vdev_removed) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
+ VDEV_AUX_NONE);
+ return (error);
+ }
+
/*
* Physical volume size should never be larger than its max size, unless
* the disk has shrunk while we were reading it or the device is buggy
@@ -1986,8 +2121,8 @@ vdev_open(vdev_t *vd)
}
}
- osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
- max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
+ osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
+ max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);
if (vd->vdev_children == 0) {
if (osize < SPA_MINDEVSIZE) {
@@ -2062,9 +2197,9 @@ vdev_open(vdev_t *vd)
return (SET_ERROR(EDOM));
}
- if (vd->vdev_top == vd) {
+ if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
vdev_ashift_optimize(vd);
- }
+ vd->vdev_attaching = B_FALSE;
}
if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
vd->vdev_ashift > ASHIFT_MAX)) {
@@ -2125,8 +2260,7 @@ vdev_open(vdev_t *vd)
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
uint64_t min_alloc = vdev_get_min_alloc(vd);
- if (min_alloc < spa->spa_min_alloc)
- spa->spa_min_alloc = min_alloc;
+ vdev_spa_set_alloc(spa, min_alloc);
}
/*
@@ -2371,22 +2505,36 @@ vdev_validate(vdev_t *vd)
}
static void
-vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
{
- char *old, *new;
- if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
- if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
- zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
- "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
- dvd->vdev_path, svd->vdev_path);
- spa_strfree(dvd->vdev_path);
- dvd->vdev_path = spa_strdup(svd->vdev_path);
+ if (svd != NULL && *dvd != NULL) {
+ if (strcmp(svd, *dvd) != 0) {
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
+ "from '%s' to '%s'", (u_longlong_t)guid, prefix,
+ *dvd, svd);
+ spa_strfree(*dvd);
+ *dvd = spa_strdup(svd);
}
- } else if (svd->vdev_path != NULL) {
- dvd->vdev_path = spa_strdup(svd->vdev_path);
+ } else if (svd != NULL) {
+ *dvd = spa_strdup(svd);
zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
- (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+ (u_longlong_t)guid, *dvd);
}
+}
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+ char *old, *new;
+
+ vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
+ dvd->vdev_guid);
+
+ vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
+ dvd->vdev_guid);
+
+ vdev_update_path("vdev_physpath", svd->vdev_physpath,
+ &dvd->vdev_physpath, dvd->vdev_guid);
/*
* Our enclosure sysfs path may have changed between imports
@@ -2527,8 +2675,6 @@ vdev_close(vdev_t *vd)
vd->vdev_ops->vdev_op_close(vd);
- vdev_cache_purge(vd);
-
/*
* We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
@@ -2615,6 +2761,17 @@ vdev_reopen(vdev_t *vd)
}
/*
+ * Recheck if resilver is still needed and cancel any
+ * scheduled resilver if resilver is unneeded.
+ */
+ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
+ spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
+ mutex_exit(&spa->spa_async_lock);
+ }
+
+ /*
* Reassess parent vdev's health.
*/
vdev_propagate_state(vd);
@@ -2848,6 +3005,8 @@ boolean_t
vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
+ (void) dva, (void) psize;
+
/* Set by sequential resilver. */
if (phys_birth == TXG_UNKNOWN)
return (B_TRUE);
@@ -3103,32 +3262,71 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
if (txg != 0)
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- return;
+ } else {
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
+ if (t == DTL_SCRUB) {
+ /* leaf vdevs only */
+ continue;
+ }
+ if (t == DTL_PARTIAL) {
+ /* i.e. non-zero */
+ minref = 1;
+ } else if (vdev_get_nparity(vd) != 0) {
+ /* RAIDZ, DRAID */
+ minref = vdev_get_nparity(vd) + 1;
+ } else {
+ /* any kind of mirror */
+ minref = vd->vdev_children;
+ }
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree,
+ cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
}
- mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
- /* account for child's outage in parent's missing map */
- int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
- if (t == DTL_SCRUB)
- continue; /* leaf vdevs only */
- if (t == DTL_PARTIAL)
- minref = 1; /* i.e. non-zero */
- else if (vdev_get_nparity(vd) != 0)
- minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
- else
- minref = vd->vdev_children; /* any kind of mirror */
- space_reftree_create(&reftree);
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- mutex_enter(&cvd->vdev_dtl_lock);
- space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
- mutex_exit(&cvd->vdev_dtl_lock);
- }
- space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
- space_reftree_destroy(&reftree);
+ if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
+ raidz_dtl_reassessed(vd);
}
- mutex_exit(&vd->vdev_dtl_lock);
+}
+
+/*
+ * Iterate over all the vdevs except spares, and post kobj events.
+ */
+void
+vdev_post_kobj_evt(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_kobj_evt_post &&
+ vd->vdev_kobj_flag == B_FALSE) {
+ vd->vdev_kobj_flag = B_TRUE;
+ vd->vdev_ops->vdev_op_kobj_evt_post(vd);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_post_kobj_evt(vd->vdev_child[c]);
+}
+
+/*
+ * Iterate over all the vdevs except spares, and clear kobj events.
+ */
+void
+vdev_clear_kobj_evt(vdev_t *vd)
+{
+ vd->vdev_kobj_flag = B_FALSE;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_clear_kobj_evt(vd->vdev_child[c]);
}
int
@@ -3242,6 +3440,12 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
vdev_zap_allocation_data(vd, tx);
}
}
+ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 &&
+ spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
+ if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
+ spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
+ vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
+ }
for (uint64_t i = 0; i < vd->vdev_children; i++) {
vdev_construct_zaps(vd->vdev_child[i], tx);
@@ -3469,6 +3673,12 @@ vdev_load(vdev_t *vd)
vdev_set_deflate_ratio(vd);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ error = vdev_raidz_load(vd);
+ if (error != 0)
+ return (error);
+ }
+
/*
* On spa_load path, grab the allocation bias from our zap
*/
@@ -3492,6 +3702,26 @@ vdev_load(vdev_t *vd)
}
}
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ uint64_t failfast;
+
+ error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
+ 1, &failfast);
+ if (error == 0) {
+ vd->vdev_failfast = failfast & 1;
+ } else if (error == ENOENT) {
+ vd->vdev_failfast = vdev_prop_default_numeric(
+ VDEV_PROP_FAILFAST);
+ } else {
+ vdev_dbgmsg(vd,
+ "vdev_load: zap_lookup(top_zap=%llu) "
+ "failed [error=%d]",
+ (u_longlong_t)vd->vdev_top_zap, error);
+ }
+ }
+
/*
* Load any rebuild state from the top-level vdev zap.
*/
@@ -3506,6 +3736,51 @@ vdev_load(vdev_t *vd)
}
}
+ if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
+ uint64_t zapobj;
+
+ if (vd->vdev_top_zap != 0)
+ zapobj = vd->vdev_top_zap;
+ else
+ zapobj = vd->vdev_leaf_zap;
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
+ &vd->vdev_checksum_n);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
+ &vd->vdev_checksum_t);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
+ &vd->vdev_io_n);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
+ &vd->vdev_io_t);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
+ &vd->vdev_slow_io_n);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+
+ error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
+ &vd->vdev_slow_io_t);
+ if (error && error != ENOENT)
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
+ "failed [error=%d]", (u_longlong_t)zapobj, error);
+ }
+
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
@@ -3793,10 +4068,22 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+/*
+ * Return the amount of space that should be (or was) allocated for the given
+ * psize (compressed block size) in the given TXG. Note that for expanded
+ * RAIDZ vdevs, the size allocated for older BP's may be larger. See
+ * vdev_raidz_asize().
+ */
+uint64_t
+vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+}
+
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
- return (vd->vdev_ops->vdev_op_asize(vd, psize));
+ return (vdev_psize_to_asize_txg(vd, psize, 0));
}
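The txg parameter matters because an expanded RAID-Z vdev answers this question using the logical width that was in effect at the block's birth txg. The sketch below is only a rough approximation of the usual RAID-Z asize accounting (the authoritative version is vdev_raidz_asize() in vdev_raidz.c); with assumed ashift and widths, it shows why the narrower pre-expansion width yields a larger asize for the same psize.

/* Rough approximation of RAID-Z asize accounting; illustration only. */
#include <stdint.h>
#include <stdio.h>

static uint64_t
raidz_asize_approx(uint64_t psize, uint64_t ashift, uint64_t width,
    uint64_t nparity)
{
	uint64_t asize = ((psize - 1) >> ashift) + 1;		/* data sectors */
	asize += nparity * ((asize + width - nparity - 1) /
	    (width - nparity));					/* parity sectors */
	asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1); /* roundup */
	return (asize << ashift);
}

int
main(void)
{
	/* Same 128K block, ashift=12, raidz1 at its original vs expanded width. */
	(void) printf("width 4: %llu bytes\n",
	    (unsigned long long)raidz_asize_approx(1ULL << 17, 12, 4, 1));
	(void) printf("width 5: %llu bytes\n",
	    (unsigned long long)raidz_asize_approx(1ULL << 17, 12, 5, 1));
	return (0);
}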
/*
@@ -3912,6 +4199,36 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
return (spa_vdev_state_exit(spa, vd, 0));
}
+int
+vdev_remove_wanted(spa_t *spa, uint64_t guid)
+{
+ vdev_t *vd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ /*
+ * If the vdev is already removed, or is expanding (which can
+ * trigger repartition add/remove events), then don't do anything.
+ */
+ if (vd->vdev_removed || vd->vdev_expanding)
+ return (spa_vdev_state_exit(spa, NULL, 0));
+
+ /*
+ * Confirm the vdev has been removed, otherwise don't do anything.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_REMOVE);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+
/*
* Online the given vdev.
*
@@ -3932,9 +4249,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
-
wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
oldstate = vd->vdev_state;
@@ -3973,6 +4287,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
/* XXX - L2ARC 1.0 does not support expansion */
if (vd->vdev_aux)
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa->spa_ccw_fail_time = 0;
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
@@ -4002,9 +4317,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
- vd->vdev_state >= VDEV_STATE_DEGRADED))
+ vd->vdev_state >= VDEV_STATE_DEGRADED)) {
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
+ /*
+ * Asynchronously detach the spare vdev if neither resilver
+ * nor rebuild is required.
+ */
+ if (vd->vdev_unspare &&
+ !dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
+ !vdev_rebuild_active(tvd))
+ spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
+ }
return (spa_vdev_state_exit(spa, vd, 0));
}
@@ -4155,9 +4480,9 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vdev_clear(spa, vd->vdev_child[c]);
/*
- * It makes no sense to "clear" an indirect vdev.
+ * It makes no sense to "clear" an indirect or removed vdev.
*/
- if (!vdev_is_concrete(vd))
+ if (!vdev_is_concrete(vd) || vd->vdev_removed)
return;
/*
@@ -4297,6 +4622,8 @@ vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
static void
vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
{
+ (void) cvd;
+
int t, b;
for (t = 0; t < ZIO_TYPES; t++) {
for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
@@ -4386,11 +4713,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
- for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
- vsx->vsx_active_queue[t] =
- vd->vdev_queue.vq_class[t].vqc_active;
- vsx->vsx_pend_queue[t] = avl_numnodes(
- &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+ for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+ vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+ vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
}
}
}
@@ -4401,12 +4726,13 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vdev_t *tvd = vd->vdev_top;
mutex_enter(&vd->vdev_stat_lock);
if (vs) {
- bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ memcpy(vs, &vd->vdev_stat, sizeof (*vs));
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf) {
+ vs->vs_pspace = vd->vdev_psize;
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
/*
@@ -4444,15 +4770,18 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
* can expand.
*/
if (vd->vdev_aux == NULL && tvd != NULL) {
- vs->vs_esize = P2ALIGN(
+ vs->vs_esize = P2ALIGN_TYPED(
vd->vdev_max_asize - vd->vdev_asize,
- 1ULL << tvd->vdev_ms_shift);
+ 1ULL << tvd->vdev_ms_shift, uint64_t);
}
vs->vs_configured_ashift = vd->vdev_top != NULL
? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
vs->vs_logical_ashift = vd->vdev_logical_ashift;
- vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ if (vd->vdev_physical_ashift <= ASHIFT_MAX)
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ else
+ vs->vs_physical_ashift = 0;
/*
* Report fragmentation and rebuild progress for top-level,
@@ -4469,6 +4798,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
+ vs->vs_noalloc = MAX(vd->vdev_noalloc,
+ tvd ? tvd->vdev_noalloc : 0);
}
vdev_get_stats_ex_impl(vd, vs, vsx);
@@ -4512,8 +4843,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
vdev_t *pvd;
uint64_t txg = zio->io_txg;
+/* Suppress ASAN false positive */
+#ifdef __SANITIZE_ADDRESS__
+ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
+ vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
+#else
vdev_stat_t *vs = &vd->vdev_stat;
vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
+#endif
zio_type_t type = zio->io_type;
int flags = zio->io_flags;
@@ -4597,11 +4934,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
/*
* TRIM ops and bytes are reported to user space as
- * ZIO_TYPE_IOCTL. This is done to preserve the
+ * ZIO_TYPE_FLUSH. This is done to preserve the
* vdev_stat_t structure layout for user space.
*/
if (type == ZIO_TYPE_TRIM)
- vs_type = ZIO_TYPE_IOCTL;
+ vs_type = ZIO_TYPE_FLUSH;
/*
* Solely for the purposes of 'zpool iostat -lqrw'
@@ -4730,6 +5067,7 @@ void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta)
{
+ (void) defer_delta;
int64_t dspace_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
@@ -5191,7 +5529,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ if ((vd->vdev_spa->spa_raidz_expand == NULL ||
+ vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
+ (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
vdev_is_concrete(vd)) {
vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
@@ -5207,9 +5547,13 @@ vdev_split(vdev_t *vd)
{
vdev_t *cvd, *pvd = vd->vdev_parent;
+ VERIFY3U(pvd->vdev_children, >, 1);
+
vdev_remove_child(pvd, vd);
vdev_compact_children(pvd);
+ ASSERT3P(pvd->vdev_child, !=, NULL);
+
cvd = pvd->vdev_child[0];
if (pvd->vdev_children == 1) {
vdev_remove_parent(cvd);
@@ -5219,7 +5563,7 @@ vdev_split(vdev_t *vd)
}
void
-vdev_deadman(vdev_t *vd, char *tag)
+vdev_deadman(vdev_t *vd, const char *tag)
{
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
@@ -5231,20 +5575,20 @@ vdev_deadman(vdev_t *vd, char *tag)
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
- if (avl_numnodes(&vq->vq_active_tree) > 0) {
+ if (vq->vq_active > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
- zfs_dbgmsg("slow vdev: %s has %lu active IOs",
- vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+ zfs_dbgmsg("slow vdev: %s has %u active IOs",
+ vd->vdev_path, vq->vq_active);
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
- fio = avl_first(&vq->vq_active_tree);
+ fio = list_head(&vq->vq_active_list);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);
@@ -5375,6 +5719,23 @@ vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
}
}
+static char *
+vdev_name(vdev_t *vd, char *buf, int buflen)
+{
+ if (vd->vdev_path == NULL) {
+ if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
+ strlcpy(buf, vd->vdev_spa->spa_name, buflen);
+ } else if (!vd->vdev_ops->vdev_op_leaf) {
+ snprintf(buf, buflen, "%s-%llu",
+ vd->vdev_ops->vdev_op_type,
+ (u_longlong_t)vd->vdev_id);
+ }
+ } else {
+ strlcpy(buf, vd->vdev_path, buflen);
+ }
+ return (buf);
+}
+
/*
* Look at the vdev tree and determine whether any devices are currently being
* replaced.
@@ -5404,31 +5765,730 @@ vdev_replace_in_progress(vdev_t *vdev)
return (B_FALSE);
}
+/*
+ * Add a (source=src, propname=propval) list to an nvlist.
+ */
+static void
+vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
+ uint64_t intval, zprop_source_t src)
+{
+ nvlist_t *propval;
+
+ propval = fnvlist_alloc();
+ fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
+
+ if (strval != NULL)
+ fnvlist_add_string(propval, ZPROP_VALUE, strval);
+ else
+ fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
+
+ fnvlist_add_nvlist(nvl, propname, propval);
+ nvlist_free(propval);
+}
+
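Each property returned to user space is a nested nvlist keyed by the property name, carrying a ZPROP_SOURCE and a ZPROP_VALUE pair. The userspace sketch below reproduces that shape with the tree's fnvlist helpers; the "capacity" key and its value are invented for the example, and building against libnvpair plus sys/fs/zfs.h is an assumption of the sketch.

/* Userspace sketch of the nvlist shape built by vdev_prop_add_list(). */
#include <libnvpair.h>
#include <sys/fs/zfs.h>		/* ZPROP_SOURCE / ZPROP_VALUE, zprop_source_t */

static nvlist_t *
example_prop_entry(void)
{
	nvlist_t *outnvl = fnvlist_alloc();
	nvlist_t *propval = fnvlist_alloc();

	fnvlist_add_uint64(propval, ZPROP_SOURCE, ZPROP_SRC_NONE);
	fnvlist_add_uint64(propval, ZPROP_VALUE, 42);	/* invented value */
	fnvlist_add_nvlist(outnvl, "capacity", propval);
	nvlist_free(propval);

	/* outnvl is now { "capacity" -> { "source": NONE, "value": 42 } } */
	return (outnvl);
}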
+static void
+vdev_props_set_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *vd;
+ nvlist_t *nvp = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ nvpair_t *elem = NULL;
+ uint64_t vdev_guid;
+ uint64_t objid;
+ nvlist_t *nvprops;
+
+ vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
+ nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
+ vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+
+ /* this vdev could get removed while waiting for this sync task */
+ if (vd == NULL)
+ return;
+
+ /*
+ * Set vdev property values in the vdev props mos object.
+ */
+ if (vd->vdev_root_zap != 0) {
+ objid = vd->vdev_root_zap;
+ } else if (vd->vdev_top_zap != 0) {
+ objid = vd->vdev_top_zap;
+ } else if (vd->vdev_leaf_zap != 0) {
+ objid = vd->vdev_leaf_zap;
+ } else {
+ panic("unexpected vdev type");
+ }
+
+ mutex_enter(&spa->spa_props_lock);
+
+ while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
+ uint64_t intval;
+ const char *strval;
+ vdev_prop_t prop;
+ const char *propname = nvpair_name(elem);
+ zprop_type_t proptype;
+
+ switch (prop = vdev_name_to_prop(propname)) {
+ case VDEV_PROP_USERPROP:
+ if (vdev_prop_user(propname)) {
+ strval = fnvpair_value_string(elem);
+ if (strlen(strval) == 0) {
+ /* remove the property if value == "" */
+ (void) zap_remove(mos, objid, propname,
+ tx);
+ } else {
+ VERIFY0(zap_update(mos, objid, propname,
+ 1, strlen(strval) + 1, strval, tx));
+ }
+ spa_history_log_internal(spa, "vdev set", tx,
+ "vdev_guid=%llu: %s=%s",
+ (u_longlong_t)vdev_guid, nvpair_name(elem),
+ strval);
+ }
+ break;
+ default:
+ /* normalize the property name */
+ propname = vdev_prop_to_name(prop);
+ proptype = vdev_prop_get_type(prop);
+
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ ASSERT(proptype == PROP_TYPE_STRING);
+ strval = fnvpair_value_string(elem);
+ VERIFY0(zap_update(mos, objid, propname,
+ 1, strlen(strval) + 1, strval, tx));
+ spa_history_log_internal(spa, "vdev set", tx,
+ "vdev_guid=%llu: %s=%s",
+ (u_longlong_t)vdev_guid, nvpair_name(elem),
+ strval);
+ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ intval = fnvpair_value_uint64(elem);
+
+ if (proptype == PROP_TYPE_INDEX) {
+ const char *unused;
+ VERIFY0(vdev_prop_index_to_string(
+ prop, intval, &unused));
+ }
+ VERIFY0(zap_update(mos, objid, propname,
+ sizeof (uint64_t), 1, &intval, tx));
+ spa_history_log_internal(spa, "vdev set", tx,
+ "vdev_guid=%llu: %s=%lld",
+ (u_longlong_t)vdev_guid,
+ nvpair_name(elem), (longlong_t)intval);
+ } else {
+ panic("invalid vdev property type %u",
+ nvpair_type(elem));
+ }
+ }
+
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+}
+
+int
+vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvpair_t *elem = NULL;
+ uint64_t vdev_guid;
+ nvlist_t *nvprops;
+ int error = 0;
+
+ ASSERT(vd != NULL);
+
+ /* Check that vdev has a zap we can use */
+ if (vd->vdev_root_zap == 0 &&
+ vd->vdev_top_zap == 0 &&
+ vd->vdev_leaf_zap == 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
+ &vdev_guid) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
+ &nvprops) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
+ return (SET_ERROR(EINVAL));
+
+ while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
+ const char *propname = nvpair_name(elem);
+ vdev_prop_t prop = vdev_name_to_prop(propname);
+ uint64_t intval = 0;
+ const char *strval = NULL;
+
+ if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
+ error = EINVAL;
+ goto end;
+ }
+
+ if (vdev_prop_readonly(prop)) {
+ error = EROFS;
+ goto end;
+ }
+
+ /* Special Processing */
+ switch (prop) {
+ case VDEV_PROP_PATH:
+ if (vd->vdev_path == NULL) {
+ error = EROFS;
+ break;
+ }
+ if (nvpair_value_string(elem, &strval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ /* New path must start with /dev/ */
+ if (strncmp(strval, "/dev/", 5)) {
+ error = EINVAL;
+ break;
+ }
+ error = spa_vdev_setpath(spa, vdev_guid, strval);
+ break;
+ case VDEV_PROP_ALLOCATING:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ if (intval != vd->vdev_noalloc)
+ break;
+ if (intval == 0)
+ error = spa_vdev_noalloc(spa, vdev_guid);
+ else
+ error = spa_vdev_alloc(spa, vdev_guid);
+ break;
+ case VDEV_PROP_FAILFAST:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_failfast = intval & 1;
+ break;
+ case VDEV_PROP_CHECKSUM_N:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_checksum_n = intval;
+ break;
+ case VDEV_PROP_CHECKSUM_T:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_checksum_t = intval;
+ break;
+ case VDEV_PROP_IO_N:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_io_n = intval;
+ break;
+ case VDEV_PROP_IO_T:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_io_t = intval;
+ break;
+ case VDEV_PROP_SLOW_IO_N:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_slow_io_n = intval;
+ break;
+ case VDEV_PROP_SLOW_IO_T:
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_slow_io_t = intval;
+ break;
+ default:
+ /* Most processing is done in vdev_props_set_sync */
+ break;
+ }
+end:
+ if (error != 0) {
+ intval = error;
+ vdev_prop_add_list(outnvl, propname, strval, intval, 0);
+ return (error);
+ }
+ }
+
+ return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
+ innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+int
+vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ int err = 0;
+ uint64_t objid;
+ uint64_t vdev_guid;
+ nvpair_t *elem = NULL;
+ nvlist_t *nvprops = NULL;
+ uint64_t intval = 0;
+ char *strval = NULL;
+ const char *propname = NULL;
+ vdev_prop_t prop;
+
+ ASSERT(vd != NULL);
+ ASSERT(mos != NULL);
+
+ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
+ &vdev_guid) != 0)
+ return (SET_ERROR(EINVAL));
+
+ nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
+
+ if (vd->vdev_root_zap != 0) {
+ objid = vd->vdev_root_zap;
+ } else if (vd->vdev_top_zap != 0) {
+ objid = vd->vdev_top_zap;
+ } else if (vd->vdev_leaf_zap != 0) {
+ objid = vd->vdev_leaf_zap;
+ } else {
+ return (SET_ERROR(EINVAL));
+ }
+ ASSERT(objid != 0);
+
+ mutex_enter(&spa->spa_props_lock);
+
+ if (nvprops != NULL) {
+ char namebuf[64] = { 0 };
+
+ while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
+ intval = 0;
+ strval = NULL;
+ propname = nvpair_name(elem);
+ prop = vdev_name_to_prop(propname);
+ zprop_source_t src = ZPROP_SRC_DEFAULT;
+ uint64_t integer_size, num_integers;
+
+ switch (prop) {
+ /* Special Read-only Properties */
+ case VDEV_PROP_NAME:
+ strval = vdev_name(vd, namebuf,
+ sizeof (namebuf));
+ if (strval == NULL)
+ continue;
+ vdev_prop_add_list(outnvl, propname, strval, 0,
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_CAPACITY:
+ /* percent used */
+ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
+ (vd->vdev_stat.vs_alloc * 100 /
+ vd->vdev_stat.vs_dspace);
+ vdev_prop_add_list(outnvl, propname, NULL,
+ intval, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_STATE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_state, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_GUID:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_guid, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_ASIZE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_asize, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_PSIZE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_psize, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_ASHIFT:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_ashift, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_SIZE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_FREE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_dspace -
+ vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_ALLOCATED:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_EXPANDSZ:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_FRAGMENTATION:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_fragmentation,
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_PARITY:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vdev_get_nparity(vd), ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_PATH:
+ if (vd->vdev_path == NULL)
+ continue;
+ vdev_prop_add_list(outnvl, propname,
+ vd->vdev_path, 0, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_DEVID:
+ if (vd->vdev_devid == NULL)
+ continue;
+ vdev_prop_add_list(outnvl, propname,
+ vd->vdev_devid, 0, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_PHYS_PATH:
+ if (vd->vdev_physpath == NULL)
+ continue;
+ vdev_prop_add_list(outnvl, propname,
+ vd->vdev_physpath, 0, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_ENC_PATH:
+ if (vd->vdev_enc_sysfs_path == NULL)
+ continue;
+ vdev_prop_add_list(outnvl, propname,
+ vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_FRU:
+ if (vd->vdev_fru == NULL)
+ continue;
+ vdev_prop_add_list(outnvl, propname,
+ vd->vdev_fru, 0, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_PARENT:
+ if (vd->vdev_parent != NULL) {
+ strval = vdev_name(vd->vdev_parent,
+ namebuf, sizeof (namebuf));
+ vdev_prop_add_list(outnvl, propname,
+ strval, 0, ZPROP_SRC_NONE);
+ }
+ continue;
+ case VDEV_PROP_CHILDREN:
+ if (vd->vdev_children > 0)
+ strval = kmem_zalloc(ZAP_MAXVALUELEN,
+ KM_SLEEP);
+ for (uint64_t i = 0; i < vd->vdev_children;
+ i++) {
+ const char *vname;
+
+ vname = vdev_name(vd->vdev_child[i],
+ namebuf, sizeof (namebuf));
+ if (vname == NULL)
+ vname = "(unknown)";
+ if (strlen(strval) > 0)
+ strlcat(strval, ",",
+ ZAP_MAXVALUELEN);
+ strlcat(strval, vname, ZAP_MAXVALUELEN);
+ }
+ if (strval != NULL) {
+ vdev_prop_add_list(outnvl, propname,
+ strval, 0, ZPROP_SRC_NONE);
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ }
+ continue;
+ case VDEV_PROP_NUMCHILDREN:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_children, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_READ_ERRORS:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_read_errors,
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_WRITE_ERRORS:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_write_errors,
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_CHECKSUM_ERRORS:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_checksum_errors,
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_INITIALIZE_ERRORS:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_initialize_errors,
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_OPS_NULL:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_OPS_READ:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_OPS_WRITE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_OPS_FREE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_OPS_CLAIM:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_OPS_TRIM:
+ /*
+ * TRIM ops and bytes are reported to user
+ * space as ZIO_TYPE_FLUSH. This is done to
+ * preserve the vdev_stat_t structure layout
+ * for user space.
+ */
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_BYTES_NULL:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_BYTES_READ:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_BYTES_WRITE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_BYTES_FREE:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_BYTES_CLAIM:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_BYTES_TRIM:
+ /*
+ * TRIM ops and bytes are reported to user
+ * space as ZIO_TYPE_FLUSH. This is done to
+ * preserve the vdev_stat_t structure layout
+ * for user space.
+ */
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
+ ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_REMOVING:
+ vdev_prop_add_list(outnvl, propname, NULL,
+ vd->vdev_removing, ZPROP_SRC_NONE);
+ continue;
+ case VDEV_PROP_RAIDZ_EXPANDING:
+ /* Only expose this for raidz */
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_prop_add_list(outnvl, propname,
+ NULL, vd->vdev_rz_expanding,
+ ZPROP_SRC_NONE);
+ }
+ continue;
+ /* Numeric Properties */
+ case VDEV_PROP_ALLOCATING:
+ /* Leaf vdevs cannot have this property */
+ if (vd->vdev_mg == NULL &&
+ vd->vdev_top != NULL) {
+ src = ZPROP_SRC_NONE;
+ intval = ZPROP_BOOLEAN_NA;
+ } else {
+ err = vdev_prop_get_int(vd, prop,
+ &intval);
+ if (err && err != ENOENT)
+ break;
+
+ if (intval ==
+ vdev_prop_default_numeric(prop))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+ }
+
+ vdev_prop_add_list(outnvl, propname, NULL,
+ intval, src);
+ break;
+ case VDEV_PROP_FAILFAST:
+ src = ZPROP_SRC_LOCAL;
+ strval = NULL;
+
+ err = zap_lookup(mos, objid, nvpair_name(elem),
+ sizeof (uint64_t), 1, &intval);
+ if (err == ENOENT) {
+ intval = vdev_prop_default_numeric(
+ prop);
+ err = 0;
+ } else if (err) {
+ break;
+ }
+ if (intval == vdev_prop_default_numeric(prop))
+ src = ZPROP_SRC_DEFAULT;
+
+ vdev_prop_add_list(outnvl, propname, strval,
+ intval, src);
+ break;
+ case VDEV_PROP_CHECKSUM_N:
+ case VDEV_PROP_CHECKSUM_T:
+ case VDEV_PROP_IO_N:
+ case VDEV_PROP_IO_T:
+ case VDEV_PROP_SLOW_IO_N:
+ case VDEV_PROP_SLOW_IO_T:
+ err = vdev_prop_get_int(vd, prop, &intval);
+ if (err && err != ENOENT)
+ break;
+
+ if (intval == vdev_prop_default_numeric(prop))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+
+ vdev_prop_add_list(outnvl, propname, NULL,
+ intval, src);
+ break;
+ /* Text Properties */
+ case VDEV_PROP_COMMENT:
+ /* Exists in the ZAP below */
+ /* FALLTHRU */
+ case VDEV_PROP_USERPROP:
+ /* User Properties */
+ src = ZPROP_SRC_LOCAL;
+
+ err = zap_length(mos, objid, nvpair_name(elem),
+ &integer_size, &num_integers);
+ if (err)
+ break;
+
+ switch (integer_size) {
+ case 8:
+ /* User properties cannot be integers */
+ err = EINVAL;
+ break;
+ case 1:
+ /* string property */
+ strval = kmem_alloc(num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos, objid,
+ nvpair_name(elem), 1,
+ num_integers, strval);
+ if (err) {
+ kmem_free(strval,
+ num_integers);
+ break;
+ }
+ vdev_prop_add_list(outnvl, propname,
+ strval, 0, src);
+ kmem_free(strval, num_integers);
+ break;
+ }
+ break;
+ default:
+ err = ENOENT;
+ break;
+ }
+ if (err)
+ break;
+ }
+ } else {
+ /*
+ * Get all properties from the MOS vdev property object.
+ */
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, mos, objid);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ intval = 0;
+ strval = NULL;
+ zprop_source_t src = ZPROP_SRC_DEFAULT;
+ propname = za.za_name;
+
+ switch (za.za_integer_length) {
+ case 8:
+ /* We do not allow integer user properties */
+ /* This is likely an internal value */
+ break;
+ case 1:
+ /* string property */
+ strval = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos, objid, za.za_name, 1,
+ za.za_num_integers, strval);
+ if (err) {
+ kmem_free(strval, za.za_num_integers);
+ break;
+ }
+ vdev_prop_add_list(outnvl, propname, strval, 0,
+ src);
+ kmem_free(strval, za.za_num_integers);
+ break;
+
+ default:
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+ if (err && err != ENOENT) {
+ return (err);
+ }
+
+ return (0);
+}
+
EXPORT_SYMBOL(vdev_fault);
EXPORT_SYMBOL(vdev_degrade);
EXPORT_SYMBOL(vdev_online);
EXPORT_SYMBOL(vdev_offline);
EXPORT_SYMBOL(vdev_clear);
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
"Target number of metaslabs per top-level vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
- "Default limit for metaslab size");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
+ "Default lower limit for metaslab size");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
+ "Default upper limit for metaslab size");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
"Minimum number of metaslabs per top-level vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
"Practical upper limit of total metaslabs per top-level vdev");
ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
"Rate limit slow IO (delay) events to this many per second");
+ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
+ "Rate limit hung IO (deadman) events to this many per second");
+
+/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
"Rate limit checksum events to this many checksum errors per second "
- "(do not set below zed threshold).");
+ "(do not set below ZED threshold).");
+/* END CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
"Ignore errors during resilver/scrub");
@@ -5439,15 +6499,16 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
"Disable cache flushes");
-ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
"Minimum number of metaslabs required to dedicate one for log blocks");
+/* BEGIN CSTYLED */
ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
- param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
+ param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
"Minimum ashift used when creating new top-level vdevs");
ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
- param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
+ param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
"Maximum ashift used when optimizing for logical -> physical sector "
"size on new top-level vdevs");
/* END CSTYLED */
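The vdev property handlers above consume an input nvlist that carries the target vdev's guid plus a nested nvlist naming the properties of interest. A rough user-space sketch of that request shape, using libnvpair's fnvlist helpers, follows; the guid and the property names ("checksum_n", "io_t") are placeholders for illustration, and the header paths are assumed.

#include <stdio.h>
#include <stdint.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>		/* ZPOOL_VDEV_PROPS_GET_* keys (assumed path) */

int
main(void)
{
	nvlist_t *innvl = fnvlist_alloc();
	nvlist_t *props = fnvlist_alloc();
	uint64_t vdev_guid = 0x1234abcd5678ef00ULL;	/* placeholder guid */

	/* Name the properties to fetch; the values are ignored for a get. */
	fnvlist_add_uint64(props, "checksum_n", 0);
	fnvlist_add_uint64(props, "io_t", 0);

	/* Same keys that vdev_prop_get() looks up from innvl. */
	fnvlist_add_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid);
	fnvlist_add_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, props);

	nvlist_print(stdout, innvl);

	fnvlist_free(props);
	fnvlist_free(innvl);
	return (0);
}
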
diff --git a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c
deleted file mode 100644
index 6e82184b800d..000000000000
--- a/sys/contrib/openzfs/module/zfs/vdev_cache.c
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
- */
-
-#include <sys/zfs_context.h>
-#include <sys/spa.h>
-#include <sys/vdev_impl.h>
-#include <sys/zio.h>
-#include <sys/kstat.h>
-#include <sys/abd.h>
-
-/*
- * Virtual device read-ahead caching.
- *
- * This file implements a simple LRU read-ahead cache. When the DMU reads
- * a given block, it will often want other, nearby blocks soon thereafter.
- * We take advantage of this by reading a larger disk region and caching
- * the result. In the best case, this can turn 128 back-to-back 512-byte
- * reads into a single 64k read followed by 127 cache hits; this reduces
- * latency dramatically. In the worst case, it can turn an isolated 512-byte
- * read into a 64k read, which doesn't affect latency all that much but is
- * terribly wasteful of bandwidth. A more intelligent version of the cache
- * could keep track of access patterns and not do read-ahead unless it sees
- * at least two temporally close I/Os to the same region. Currently, only
- * metadata I/O is inflated. A further enhancement could take advantage of
- * more semantic information about the I/O. And it could use something
- * faster than an AVL tree; that was chosen solely for convenience.
- *
- * There are five cache operations: allocate, fill, read, write, evict.
- *
- * (1) Allocate. This reserves a cache entry for the specified region.
- * We separate the allocate and fill operations so that multiple threads
- * don't generate I/O for the same cache miss.
- *
- * (2) Fill. When the I/O for a cache miss completes, the fill routine
- * places the data in the previously allocated cache entry.
- *
- * (3) Read. Read data from the cache.
- *
- * (4) Write. Update cache contents after write completion.
- *
- * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
- * if the total cache size exceeds zfs_vdev_cache_size.
- */
-
-/*
- * These tunables are for performance analysis.
- */
-/*
- * All i/os smaller than zfs_vdev_cache_max will be turned into
- * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
- * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
- * vdev's vdev_cache.
- *
- * TODO: Note that with the current ZFS code, it turns out that the
- * vdev cache is not helpful, and in some cases actually harmful. It
- * is better if we disable this. Once some time has passed, we should
- * actually remove this to simplify the code. For now we just disable
- * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11
- * has made these same changes.
- */
-int zfs_vdev_cache_max = 1<<14; /* 16KB */
-int zfs_vdev_cache_size = 0;
-int zfs_vdev_cache_bshift = 16;
-
-#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
-
-kstat_t *vdc_ksp = NULL;
-
-typedef struct vdc_stats {
- kstat_named_t vdc_stat_delegations;
- kstat_named_t vdc_stat_hits;
- kstat_named_t vdc_stat_misses;
-} vdc_stats_t;
-
-static vdc_stats_t vdc_stats = {
- { "delegations", KSTAT_DATA_UINT64 },
- { "hits", KSTAT_DATA_UINT64 },
- { "misses", KSTAT_DATA_UINT64 }
-};
-
-#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64);
-
-static inline int
-vdev_cache_offset_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
- const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
-
- return (TREE_CMP(ve1->ve_offset, ve2->ve_offset));
-}
-
-static int
-vdev_cache_lastused_compare(const void *a1, const void *a2)
-{
- const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
- const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
-
- int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused);
- if (likely(cmp))
- return (cmp);
-
- /*
- * Among equally old entries, sort by offset to ensure uniqueness.
- */
- return (vdev_cache_offset_compare(a1, a2));
-}
-
-/*
- * Evict the specified entry from the cache.
- */
-static void
-vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
-{
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT3P(ve->ve_fill_io, ==, NULL);
- ASSERT3P(ve->ve_abd, !=, NULL);
-
- avl_remove(&vc->vc_lastused_tree, ve);
- avl_remove(&vc->vc_offset_tree, ve);
- abd_free(ve->ve_abd);
- kmem_free(ve, sizeof (vdev_cache_entry_t));
-}
-
-/*
- * Allocate an entry in the cache. At the point we don't have the data,
- * we're just creating a placeholder so that multiple threads don't all
- * go off and read the same blocks.
- */
-static vdev_cache_entry_t *
-vdev_cache_allocate(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
- vdev_cache_entry_t *ve;
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
-
- if (zfs_vdev_cache_size == 0)
- return (NULL);
-
- /*
- * If adding a new entry would exceed the cache size,
- * evict the oldest entry (LRU).
- */
- if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
- zfs_vdev_cache_size) {
- ve = avl_first(&vc->vc_lastused_tree);
- if (ve->ve_fill_io != NULL)
- return (NULL);
- ASSERT3U(ve->ve_hits, !=, 0);
- vdev_cache_evict(vc, ve);
- }
-
- ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
- ve->ve_offset = offset;
- ve->ve_lastused = ddi_get_lbolt();
- ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
-
- avl_add(&vc->vc_offset_tree, ve);
- avl_add(&vc->vc_lastused_tree, ve);
-
- return (ve);
-}
-
-static void
-vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
-{
- uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
-
- ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT3P(ve->ve_fill_io, ==, NULL);
-
- if (ve->ve_lastused != ddi_get_lbolt()) {
- avl_remove(&vc->vc_lastused_tree, ve);
- ve->ve_lastused = ddi_get_lbolt();
- avl_add(&vc->vc_lastused_tree, ve);
- }
-
- ve->ve_hits++;
- abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
-}
-
-/*
- * Fill a previously allocated cache entry with data.
- */
-static void
-vdev_cache_fill(zio_t *fio)
-{
- vdev_t *vd = fio->io_vd;
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve = fio->io_private;
- zio_t *pio;
-
- ASSERT3U(fio->io_size, ==, VCBS);
-
- /*
- * Add data to the cache.
- */
- mutex_enter(&vc->vc_lock);
-
- ASSERT3P(ve->ve_fill_io, ==, fio);
- ASSERT3U(ve->ve_offset, ==, fio->io_offset);
- ASSERT3P(ve->ve_abd, ==, fio->io_abd);
-
- ve->ve_fill_io = NULL;
-
- /*
- * Even if this cache line was invalidated by a missed write update,
- * any reads that were queued up before the missed update are still
- * valid, so we can satisfy them from this line before we evict it.
- */
- zio_link_t *zl = NULL;
- while ((pio = zio_walk_parents(fio, &zl)) != NULL)
- vdev_cache_hit(vc, ve, pio);
-
- if (fio->io_error || ve->ve_missed_update)
- vdev_cache_evict(vc, ve);
-
- mutex_exit(&vc->vc_lock);
-}
-
-/*
- * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss.
- */
-boolean_t
-vdev_cache_read(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, *ve_search;
- uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
- zio_t *fio;
- uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS);
-
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
-
- if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
- return (B_FALSE);
-
- if (zio->io_size > zfs_vdev_cache_max)
- return (B_FALSE);
-
- /*
- * If the I/O straddles two or more cache blocks, don't cache it.
- */
- if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
- return (B_FALSE);
-
- ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
- ve_search->ve_offset = cache_offset;
- ve = avl_find(&vc->vc_offset_tree, ve_search, NULL);
- kmem_free(ve_search, sizeof (vdev_cache_entry_t));
-
- if (ve != NULL) {
- if (ve->ve_missed_update) {
- mutex_exit(&vc->vc_lock);
- return (B_FALSE);
- }
-
- if ((fio = ve->ve_fill_io) != NULL) {
- zio_vdev_io_bypass(zio);
- zio_add_child(zio, fio);
- mutex_exit(&vc->vc_lock);
- VDCSTAT_BUMP(vdc_stat_delegations);
- return (B_TRUE);
- }
-
- vdev_cache_hit(vc, ve, zio);
- zio_vdev_io_bypass(zio);
-
- mutex_exit(&vc->vc_lock);
- VDCSTAT_BUMP(vdc_stat_hits);
- return (B_TRUE);
- }
-
- ve = vdev_cache_allocate(zio);
-
- if (ve == NULL) {
- mutex_exit(&vc->vc_lock);
- return (B_FALSE);
- }
-
- fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
- ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
- ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
-
- ve->ve_fill_io = fio;
- zio_vdev_io_bypass(zio);
- zio_add_child(zio, fio);
-
- mutex_exit(&vc->vc_lock);
- zio_nowait(fio);
- VDCSTAT_BUMP(vdc_stat_misses);
-
- return (B_TRUE);
-}
-
-/*
- * Update cache contents upon write completion.
- */
-void
-vdev_cache_write(zio_t *zio)
-{
- vdev_cache_t *vc = &zio->io_vd->vdev_cache;
- vdev_cache_entry_t *ve, ve_search;
- uint64_t io_start = zio->io_offset;
- uint64_t io_end = io_start + zio->io_size;
- uint64_t min_offset = P2ALIGN(io_start, VCBS);
- uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
- avl_index_t where;
-
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
-
- mutex_enter(&vc->vc_lock);
-
- ve_search.ve_offset = min_offset;
- ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
-
- if (ve == NULL)
- ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
-
- while (ve != NULL && ve->ve_offset < max_offset) {
- uint64_t start = MAX(ve->ve_offset, io_start);
- uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
-
- if (ve->ve_fill_io != NULL) {
- ve->ve_missed_update = 1;
- } else {
- abd_copy_off(ve->ve_abd, zio->io_abd,
- start - ve->ve_offset, start - io_start,
- end - start);
- }
- ve = AVL_NEXT(&vc->vc_offset_tree, ve);
- }
- mutex_exit(&vc->vc_lock);
-}
-
-void
-vdev_cache_purge(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
- vdev_cache_entry_t *ve;
-
- mutex_enter(&vc->vc_lock);
- while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
- vdev_cache_evict(vc, ve);
- mutex_exit(&vc->vc_lock);
-}
-
-void
-vdev_cache_init(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
-
- mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
-
- avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_offset_node));
-
- avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
- sizeof (vdev_cache_entry_t),
- offsetof(struct vdev_cache_entry, ve_lastused_node));
-}
-
-void
-vdev_cache_fini(vdev_t *vd)
-{
- vdev_cache_t *vc = &vd->vdev_cache;
-
- vdev_cache_purge(vd);
-
- avl_destroy(&vc->vc_offset_tree);
- avl_destroy(&vc->vc_lastused_tree);
-
- mutex_destroy(&vc->vc_lock);
-}
-
-void
-vdev_cache_stat_init(void)
-{
- vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
- KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (vdc_ksp != NULL) {
- vdc_ksp->ks_data = &vdc_stats;
- kstat_install(vdc_ksp);
- }
-}
-
-void
-vdev_cache_stat_fini(void)
-{
- if (vdc_ksp != NULL) {
- kstat_delete(vdc_ksp);
- vdc_ksp = NULL;
- }
-}
-
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, INT, ZMOD_RW,
- "Inflate reads small than max");
-
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, INT, ZMOD_RD,
- "Total size of the per-disk cache");
-
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, INT, ZMOD_RW,
- "Shift size to inflate reads too");
-/* END CSTYLED */
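The removed vdev_cache_read() path above only cached an I/O when it fell entirely inside a single 1 << zfs_vdev_cache_bshift (64 KiB) line, using the power-of-two helpers P2ALIGN, P2PHASE, and P2BOUNDARY. A minimal stand-alone sketch of that math, with assumed stand-in macro definitions rather than the sysmacros.h originals:

#include <stdio.h>
#include <stdint.h>

/* Assumed equivalents of the sysmacros.h helpers (power-of-two align only). */
#define P2ALIGN(x, a)		((x) & ~((a) - 1))
#define P2PHASE(x, a)		((x) & ((a) - 1))
#define P2BOUNDARY(off, len, a)	(((off) ^ ((off) + (len) - 1)) > (a) - 1)

int
main(void)
{
	uint64_t vcbs = 1ULL << 16;		/* VCBS: 64 KiB cache line */
	uint64_t io_offset = 0x12345a00ULL;	/* a 512-byte read */
	uint64_t io_size = 512;

	/* The read is served from the cache line containing its offset. */
	printf("line %#llx phase %#llx straddles %d\n",
	    (unsigned long long)P2ALIGN(io_offset, vcbs),
	    (unsigned long long)P2PHASE(io_offset, vcbs),
	    (int)P2BOUNDARY(io_offset, io_size, vcbs));
	return (0);
}
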
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c
index b8f82d52e8f0..13bb33cc6871 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_draid.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -541,7 +541,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp)
int
vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp)
{
- for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) {
+ for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) {
if (draid_maps[i].dm_children == children) {
*mapp = &draid_maps[i];
return (0);
@@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc,
* i.e. vdev_draid_psize_to_asize().
*/
static uint64_t
-vdev_draid_asize(vdev_t *vd, uint64_t psize)
+vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
+ (void) txg;
vdev_draid_config_t *vdc = vd->vdev_tsd;
uint64_t ashift = vd->vdev_ashift;
@@ -842,6 +843,53 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
}
/*
+ * Verify that all empty sectors are zero filled before using them to
+ * calculate parity. Otherwise, silent corruption in an empty sector will
+ * result in bad parity being generated. That bad parity will then be
+ * considered authoritative and overwrite the good parity on disk. This
+ * is possible because the checksum is only calculated over the data,
+ * thus it cannot be used to detect damage in empty sectors.
+ */
+int
+vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t skip_off = parity_size - skip_size;
+ uint64_t empty_off = 0;
+ int ret = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3P(rr->rr_abd_empty, !=, NULL);
+ ASSERT3U(rr->rr_bigcols, >, 0);
+
+ void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP);
+
+ for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ ASSERT3U(rc->rc_size, ==, parity_size);
+
+ if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off,
+ skip_size) != 0) {
+ vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
+ abd_zero_off(rc->rc_abd, skip_off, skip_size);
+ rc->rc_error = SET_ERROR(ECKSUM);
+ ret++;
+ }
+
+ empty_off += skip_size;
+ }
+
+ ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty));
+
+ kmem_free(zero_buf, skip_size);
+
+ return (ret);
+}
+
+/*
* Given a logical address within a dRAID configuration, return the physical
* address on the first drive in the group that this address maps to
* (at position 'start' in permutation number 'perm').
@@ -913,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
vdev_draid_config_t *vdc = vd->vdev_tsd;
uint64_t ashift = vd->vdev_top->vdev_ashift;
uint64_t io_size = abd_size;
- uint64_t io_asize = vdev_draid_asize(vd, io_size);
+ uint64_t io_asize = vdev_draid_asize(vd, io_size, 0);
uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
@@ -976,15 +1024,11 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
/* The total number of data and parity sectors for this I/O. */
uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
- raidz_row_t *rr;
- rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP);
- rr->rr_cols = groupwidth;
- rr->rr_scols = groupwidth;
+ ASSERT3U(vdc->vdc_nparity, >, 0);
+
+ raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth);
rr->rr_bigcols = bc;
- rr->rr_missingdata = 0;
- rr->rr_missingparity = 0;
rr->rr_firstdatacol = vdc->vdc_nparity;
- rr->rr_abd_empty = NULL;
#ifdef ZFS_DEBUG
rr->rr_offset = io_offset;
rr->rr_size = io_size;
@@ -1004,14 +1048,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
rc->rc_offset = physical_offset;
- rc->rc_abd = NULL;
- rc->rc_orig_data = NULL;
- rc->rc_error = 0;
- rc->rc_tried = 0;
- rc->rc_skipped = 0;
- rc->rc_force_repair = 0;
- rc->rc_allow_repair = 1;
- rc->rc_need_orig_restore = B_FALSE;
if (q == 0 && i >= bc)
rc->rc_size = 0;
@@ -1080,7 +1116,7 @@ vdev_draid_map_alloc(zio_t *zio)
if (size < abd_size) {
vdev_t *vd = zio->io_vd;
- io_offset += vdev_draid_asize(vd, size);
+ io_offset += vdev_draid_asize(vd, size, 0);
abd_offset += size;
abd_size -= size;
nrows++;
@@ -1102,7 +1138,6 @@ vdev_draid_map_alloc(zio_t *zio)
rm->rm_row[0] = rr[0];
if (nrows == 2)
rm->rm_row[1] = rr[1];
-
return (rm);
}
@@ -1449,8 +1484,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
- physical_ashift = MAX(physical_ashift,
- cvd->vdev_physical_ashift);
+ }
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ continue;
+ physical_ashift = vdev_best_ashift(logical_ashift,
+ physical_ashift, cvd->vdev_physical_ashift);
}
*asizep = asize;
@@ -1678,7 +1719,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
uint64_t nparity = vdc->vdc_nparity;
for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
- bzero(path, sizeof (path));
+ memset(path, 0, sizeof (path));
(void) snprintf(path, sizeof (path) - 1,
"%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
(u_longlong_t)nparity,
@@ -1707,7 +1748,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
if (n > 0) {
(void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- new_spares, n);
+ (const nvlist_t **)new_spares, n);
}
for (int i = 0; i < n; i++)
@@ -1728,7 +1769,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t asize = vdev_draid_asize(vd, psize);
+ uint64_t asize = vdev_draid_asize(vd, psize, 0);
if (phys_birth == TXG_UNKNOWN) {
/*
@@ -1785,7 +1826,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = rr->rr_offset;
logical_rs.rs_end = logical_rs.rs_start +
- vdev_draid_asize(vd, rr->rr_size);
+ vdev_draid_asize(vd, rr->rr_size, 0);
raidz_col_t *rc = &rr->rr_col[col];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
@@ -2154,6 +2195,7 @@ vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
static int
vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
+ (void) spa;
uint64_t ndata, nparity, nspares, ngroups;
int error;
@@ -2382,7 +2424,6 @@ vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
return (cvd);
}
-/* ARGSUSED */
static void
vdev_draid_spare_close(vdev_t *vd)
{
@@ -2507,24 +2548,20 @@ vdev_draid_read_config_spare(vdev_t *vd)
}
/*
- * Handle any ioctl requested of the distributed spare. Only flushes
- * are supported in which case all children must be flushed.
+ * Handle any flush requested of the distributed spare. All children must be
+ * flushed.
*/
static int
-vdev_draid_spare_ioctl(zio_t *zio)
+vdev_draid_spare_flush(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
int error = 0;
- if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
- for (int c = 0; c < vd->vdev_children; c++) {
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[c], zio->io_offset, zio->io_abd,
- zio->io_size, zio->io_type, zio->io_priority, 0,
- vdev_draid_spare_child_done, zio));
- }
- } else {
- error = SET_ERROR(ENOTSUP);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[c], zio->io_offset, zio->io_abd,
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
}
return (error);
@@ -2555,8 +2592,8 @@ vdev_draid_spare_io_start(zio_t *zio)
}
switch (zio->io_type) {
- case ZIO_TYPE_IOCTL:
- zio->io_error = vdev_draid_spare_ioctl(zio);
+ case ZIO_TYPE_FLUSH:
+ zio->io_error = vdev_draid_spare_flush(zio);
break;
case ZIO_TYPE_WRITE:
@@ -2641,10 +2678,10 @@ vdev_draid_spare_io_start(zio_t *zio)
zio_execute(zio);
}
-/* ARGSUSED */
static void
vdev_draid_spare_io_done(zio_t *zio)
{
+ (void) zio;
}
/*
@@ -2665,7 +2702,7 @@ vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
return (SET_ERROR(ENOENT));
}
- char *spare_name;
+ const char *spare_name;
error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name);
if (error != 0)
return (SET_ERROR(EINVAL));
@@ -2673,7 +2710,7 @@ vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
for (int i = 0; i < nspares; i++) {
nvlist_t *spare = spares[i];
uint64_t top_guid, spare_id;
- char *type, *path;
+ const char *type, *path;
/* Skip non-distributed spares */
error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type);
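The new vdev_draid_map_verify_empty() above compares each skip sector against a zeroed buffer (via abd_cmp_buf_off) so that silent corruption in an "empty" sector cannot poison freshly generated parity. A simplified flat-buffer sketch of that zero check, with the helper name invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Return 1 if the sector is all zeroes, 0 otherwise; a flat-buffer
 * stand-in for the abd_cmp_buf_off() comparison done above.
 */
static int
sector_is_zero(const void *sector, size_t size)
{
	char *zero = calloc(1, size);
	int same;

	if (zero == NULL)
		return (0);
	same = (memcmp(sector, zero, size) == 0);
	free(zero);
	return (same);
}

int
main(void)
{
	char sector[4096] = { 0 };

	printf("clean: %d\n", sector_is_zero(sector, sizeof (sector)));
	sector[100] = 0x5a;	/* simulate silent damage in an empty sector */
	printf("damaged: %d\n", sector_is_zero(sector, sizeof (sector)));
	return (0);
}
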
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
index 14ebf5514676..acb725696674 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
@@ -48,8 +48,8 @@
* "vdev_remap" operation that executes a callback on each contiguous
* segment of the new location. This function is used in multiple ways:
*
- * - i/os to this vdev use the callback to determine where the
- * data is now located, and issue child i/os for each segment's new
+ * - I/Os to this vdev use the callback to determine where the
+ * data is now located, and issue child I/Os for each segment's new
* location.
*
* - frees and claims to this vdev use the callback to free or claim
@@ -172,7 +172,7 @@
* object.
*/
-int zfs_condense_indirect_vdevs_enable = B_TRUE;
+static int zfs_condense_indirect_vdevs_enable = B_TRUE;
/*
* Condense if at least this percent of the bytes in the mapping is
@@ -181,7 +181,7 @@ int zfs_condense_indirect_vdevs_enable = B_TRUE;
* condenses. Higher values will condense less often (causing less
* i/o); lower values will reduce the mapping size more quickly.
*/
-int zfs_condense_indirect_obsolete_pct = 25;
+static uint_t zfs_condense_indirect_obsolete_pct = 25;
/*
* Condense if the obsolete space map takes up more than this amount of
@@ -189,14 +189,14 @@ int zfs_condense_indirect_obsolete_pct = 25;
* consumed by the obsolete space map; the default of 1GB is small enough
* that we typically don't mind "wasting" it.
*/
-unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
+static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
/*
* Don't bother condensing if the mapping uses less than this amount of
* memory. The default of 128KB is considered a "trivial" amount of
* memory and not worth reducing.
*/
-unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
+static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
/*
* This is used by the test suite so that it can ensure that certain
@@ -204,7 +204,7 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
* complete too quickly). If used to reduce the performance impact of
* condensing in production, a maximum value of 1 should be sufficient.
*/
-int zfs_condense_indirect_commit_entry_delay_ms = 0;
+static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0;
/*
* If an indirect split block contains more than this many possible unique
@@ -214,7 +214,7 @@ int zfs_condense_indirect_commit_entry_delay_ms = 0;
* copies to participate fairly in the reconstruction when all combinations
* cannot be checked and prevents repeated use of one bad copy.
*/
-int zfs_reconstruct_indirect_combinations_max = 4096;
+uint_t zfs_reconstruct_indirect_combinations_max = 4096;
/*
* Enable to simulate damaged segments and validate reconstruction. This
@@ -270,7 +270,7 @@ typedef struct indirect_split {
*/
indirect_child_t *is_good_child;
- indirect_child_t is_child[1]; /* variable-length */
+ indirect_child_t is_child[];
} indirect_split_t;
/*
@@ -293,17 +293,16 @@ vdev_indirect_map_free(zio_t *zio)
indirect_vsd_t *iv = zio->io_vsd;
indirect_split_t *is;
- while ((is = list_head(&iv->iv_splits)) != NULL) {
+ while ((is = list_remove_head(&iv->iv_splits)) != NULL) {
for (int c = 0; c < is->is_children; c++) {
indirect_child_t *ic = &is->is_child[c];
if (ic->ic_data != NULL)
abd_free(ic->ic_data);
}
- list_remove(&iv->iv_splits, is);
indirect_child_t *ic;
- while ((ic = list_head(&is->is_unique_child)) != NULL)
- list_remove(&is->is_unique_child, ic);
+ while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
+ ;
list_destroy(&is->is_unique_child);
@@ -637,16 +636,15 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd,
}
}
-/* ARGSUSED */
static boolean_t
spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
{
+ (void) zthr;
spa_t *spa = arg;
return (spa->spa_condensing_indirect != NULL);
}
-/* ARGSUSED */
static void
spa_condense_indirect_thread(void *arg, zthr_t *zthr)
{
@@ -941,13 +939,12 @@ vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise)
return (error);
}
-/* ARGSUSED */
static void
vdev_indirect_close(vdev_t *vd)
{
+ (void) vd;
}
-/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
@@ -1023,7 +1020,7 @@ vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
size_t copy_length = entries * sizeof (*first_mapping);
duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
- bcopy(first_mapping, duplicate_mappings, copy_length);
+ memcpy(duplicate_mappings, first_mapping, copy_length);
*copied_entries = entries;
return (duplicate_mappings);
@@ -1321,6 +1318,7 @@ vdev_indirect_io_start(zio_t *zio)
vdev_indirect_gather_splits, zio);
indirect_split_t *first = list_head(&iv->iv_splits);
+ ASSERT3P(first, !=, NULL);
if (first->is_size == zio->io_size) {
/*
* This is not a split block; we are pointing to the entire
@@ -1371,9 +1369,10 @@ vdev_indirect_io_start(zio_t *zio)
is != NULL; is = list_next(&iv->iv_splits, is)) {
zio_nowait(zio_vdev_child_io(zio, NULL,
is->is_vdev, is->is_target_offset,
- abd_get_offset(zio->io_abd,
- is->is_split_offset), is->is_size,
- zio->io_type, zio->io_priority, 0,
+ abd_get_offset_size(zio->io_abd,
+ is->is_split_offset, is->is_size),
+ is->is_size, zio->io_type,
+ zio->io_priority, 0,
vdev_indirect_child_io_done, zio));
}
@@ -1399,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio,
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
- zio_bad_cksum_t zbc = {{{ 0 }}};
+ zio_bad_cksum_t zbc = { 0 };
abd_t *bad_abd = ic->ic_data;
abd_t *good_abd = is->is_good_child->ic_data;
(void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
@@ -1480,12 +1479,12 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
vdev_t *vd = ic->ic_vdev;
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
- NULL, zio, is->is_target_offset, is->is_size,
- NULL, NULL, NULL);
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
+ (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ NULL, zio, is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
}
}
}
@@ -1659,8 +1658,8 @@ out:
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
indirect_child_t *ic;
- while ((ic = list_head(&is->is_unique_child)) != NULL)
- list_remove(&is->is_unique_child, ic);
+ while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
+ ;
is->is_unique_children = 0;
}
@@ -1885,23 +1884,28 @@ EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
EXPORT_SYMBOL(vdev_obsolete_sm_object);
/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW,
- "Whether to attempt condensing indirect vdev mappings");
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT,
+ ZMOD_RW, "Whether to attempt condensing indirect vdev mappings");
-ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, INT, ZMOD_RW,
- "Minimum obsolete percent of bytes in the mapping to attempt condensing");
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT,
+ ZMOD_RW,
+ "Minimum obsolete percent of bytes in the mapping "
+ "to attempt condensing");
-ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW,
"Don't bother condensing if the mapping uses less than this amount of "
"memory");
-ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64,
+ ZMOD_RW,
"Minimum size obsolete spacemap to attempt condensing");
-ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms,
+ UINT, ZMOD_RW,
"Used by tests to ensure certain actions happen in the middle of a "
"condense. A maximum value of 1 should be sufficient.");
-ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max,
+ UINT, ZMOD_RW,
"Maximum number of combinations when reconstructing split segments");
/* END CSTYLED */
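The indirect_split_t change above replaces the old one-element is_child[1] placeholder with a C99 flexible array member, the same idiom the raidz row allocation uses. A minimal sketch of allocating such a structure, with hypothetical type names:

#include <stdlib.h>
#include <stdint.h>

/* Hypothetical shapes mirroring a struct that ends in a flexible array. */
typedef struct child {
	uint64_t c_devidx;
} child_t;

typedef struct split {
	int	s_children;
	child_t	s_child[];	/* flexible array member, as in is_child[] */
} split_t;

int
main(void)
{
	int n = 4;

	/* One allocation covers the header plus n trailing children. */
	split_t *s = calloc(1, sizeof (split_t) + n * sizeof (child_t));

	if (s == NULL)
		return (1);
	s->s_children = n;
	for (int i = 0; i < n; i++)
		s->s_child[i].c_devidx = i;
	free(s);
	return (0);
}
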
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c
index 99b83c392257..65a57e73604f 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c
@@ -38,6 +38,8 @@ vdev_indirect_births_verify(vdev_indirect_births_t *vib)
return (B_TRUE);
}
+#else
+#define vdev_indirect_births_verify(vib) ((void) sizeof (vib), B_TRUE)
#endif
uint64_t
@@ -150,7 +152,7 @@ vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
new_entries = vmem_alloc(new_size, KM_SLEEP);
if (old_size > 0) {
- bcopy(vib->vib_entries, new_entries, old_size);
+ memcpy(new_entries, vib->vib_entries, old_size);
vmem_free(vib->vib_entries, old_size);
}
new_entries[vib->vib_phys->vib_count - 1] = vibe;
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c
index bb484a401b1b..e92495f2dd34 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c
@@ -54,6 +54,8 @@ vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
return (B_TRUE);
}
+#else
+#define vdev_indirect_mapping_verify(vim) ((void) sizeof (vim), B_TRUE)
#endif
uint64_t
@@ -480,7 +482,7 @@ vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
vim->vim_entries = vmem_alloc(new_size, KM_SLEEP);
if (old_size > 0) {
- bcopy(old_entries, vim->vim_entries, old_size);
+ memcpy(vim->vim_entries, old_entries, old_size);
vmem_free(old_entries, old_size);
}
VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
@@ -582,7 +584,7 @@ vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
0, counts_size,
counts, DMU_READ_PREFETCH));
} else {
- bzero(counts, counts_size);
+ memset(counts, 0, counts_size);
}
return (counts);
}
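Both vdev_indirect_births_add_entry() and vdev_indirect_mapping_add_entries() above append by allocating a larger array, copying the old contents with memcpy(), and freeing the old allocation. A user-space sketch of that grow-and-copy pattern, with malloc/free standing in for vmem_alloc/vmem_free and a hypothetical entry type:

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

typedef struct entry {
	uint64_t e_offset;
	uint64_t e_txg;
} entry_t;

/* Append by grow-and-copy, as the indirect births/mapping code does. */
static entry_t *
append_entry(entry_t *old, uint64_t count, entry_t e)
{
	entry_t *new = malloc((count + 1) * sizeof (entry_t));

	if (new == NULL)
		abort();
	if (count > 0) {
		memcpy(new, old, count * sizeof (entry_t));
		free(old);
	}
	new[count] = e;
	return (new);
}

int
main(void)
{
	entry_t *entries = NULL;

	for (uint64_t i = 0; i < 4; i++)
		entries = append_entry(entries, i, (entry_t){ i * 512, 100 + i });
	free(entries);
	return (0);
}
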
diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
index e9156c32f384..0a7323f58df2 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@@ -36,23 +36,20 @@
/*
* Value that is written to disk during initialization.
*/
-#ifdef _ILP32
-unsigned long zfs_initialize_value = 0xdeadbeefUL;
-#else
-unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
-#endif
+static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
/* maximum number of I/Os outstanding per leaf vdev */
-int zfs_initialize_limit = 1;
+static const int zfs_initialize_limit = 1;
/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
-unsigned long zfs_initialize_chunk_size = 1024 * 1024;
+static uint64_t zfs_initialize_chunk_size = 1024 * 1024;
static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
- vd->vdev_detached || vd->vdev_top->vdev_removing);
+ vd->vdev_detached || vd->vdev_top->vdev_removing ||
+ vd->vdev_top->vdev_rz_expanding);
}
static void
@@ -71,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
kmem_free(arg, sizeof (uint64_t));
vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
- if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ if (vd == NULL || vd->vdev_top->vdev_removing ||
+ !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
return;
uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
@@ -101,6 +99,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
}
static void
+vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx)
+{
+ uint64_t guid = *(uint64_t *)arg;
+
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE);
+ ASSERT3U(vd->vdev_leaf_zap, !=, 0);
+
+ vd->vdev_initialize_last_offset = 0;
+ vd->vdev_initialize_action_time = 0;
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ int error;
+
+ error = zap_remove(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ error = zap_remove(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ error = zap_remove(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx);
+ VERIFY(error == 0 || error == ENOENT);
+}
+
+static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
@@ -127,8 +158,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
- guid, tx);
+
+ if (new_state != VDEV_INITIALIZE_NONE) {
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, tx);
+ } else {
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_remove_sync, guid, tx);
+ }
switch (new_state) {
case VDEV_INITIALIZE_ACTIVE:
@@ -149,6 +186,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
spa_history_log_internal(spa, "initialize", tx,
"vdev=%s complete", vd->vdev_path);
break;
+ case VDEV_INITIALIZE_NONE:
+ spa_history_log_internal(spa, "uninitialize", tx,
+ "vdev=%s", vd->vdev_path);
+ break;
default:
panic("invalid state %llu", (unsigned long long)new_state);
}
@@ -255,20 +296,15 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
* divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
* allocation will guarantee these for us.
*/
-/* ARGSUSED */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
+ (void) unused;
+
ASSERT0(len % sizeof (uint64_t));
-#ifdef _ILP32
- for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
- *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
- }
-#else
for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
}
-#endif
return (0);
}
@@ -487,7 +523,7 @@ vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
}
-static void
+static __attribute__((noreturn)) void
vdev_initialize_thread(void *arg)
{
vdev_t *vd = arg;
@@ -597,6 +633,7 @@ vdev_initialize(vdev_t *vd)
ASSERT(!vd->vdev_detached);
ASSERT(!vd->vdev_initialize_exit_wanted);
ASSERT(!vd->vdev_top->vdev_removing);
+ ASSERT(!vd->vdev_top->vdev_rz_expanding);
vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
vd->vdev_initialize_thread = thread_create(NULL, 0,
@@ -604,6 +641,24 @@ vdev_initialize(vdev_t *vd)
}
/*
+ * Uninitializes a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_uninitialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE);
+}
+
+/*
* Wait for the initialize thread to be terminated (cancelled or stopped).
*/
static void
@@ -624,9 +679,11 @@ vdev_initialize_stop_wait_impl(vdev_t *vd)
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
+ (void) spa;
vdev_t *vd;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_initialize_lock);
@@ -668,7 +725,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
if (vd_list == NULL) {
vdev_initialize_stop_wait_impl(vd);
} else {
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@@ -700,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
spa_t *spa = vd->vdev_spa;
list_t vd_list;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_initialize_node));
@@ -719,7 +778,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
void
vdev_initialize_restart(vdev_t *vd)
{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ vd->vdev_spa->spa_load_thread == curthread);
ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
if (vd->vdev_leaf_zap != 0) {
@@ -738,13 +798,14 @@ vdev_initialize_restart(vdev_t *vd)
ASSERT(err == 0 || err == ENOENT);
vd->vdev_initialize_action_time = timestamp;
- if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
- vd->vdev_offline) {
+ if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
/* load progress for reporting, but don't resume */
VERIFY0(vdev_initialize_load(vd));
} else if (vd->vdev_initialize_state ==
VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
!vd->vdev_top->vdev_removing &&
+ !vd->vdev_top->vdev_rz_expanding &&
vd->vdev_initialize_thread == NULL) {
vdev_initialize(vd);
}
@@ -758,15 +819,14 @@ vdev_initialize_restart(vdev_t *vd)
}
EXPORT_SYMBOL(vdev_initialize);
+EXPORT_SYMBOL(vdev_uninitialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);
EXPORT_SYMBOL(vdev_initialize_restart);
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW,
"Value written during zpool initialize");
-ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW,
"Size in bytes of writes by zpool initialize");
-/* END CSTYLED */
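vdev_initialize_block_fill() above stamps every 8-byte word of the write buffer with zfs_initialize_value before the chunk is written out. A small user-space sketch of that fill, assuming the default 0xdeadbeefdeadbeee pattern:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>

static const uint64_t initialize_value = 0xdeadbeefdeadbeeeULL;

/* Fill the buffer with the 64-bit pattern; len must be a multiple of 8. */
static void
block_fill(void *buf, size_t len)
{
	assert(len % sizeof (uint64_t) == 0);
	for (size_t i = 0; i < len; i += sizeof (uint64_t))
		memcpy((char *)buf + i, &initialize_value, sizeof (uint64_t));
}

int
main(void)
{
	static uint64_t chunk[8192];	/* a 64 KiB slice of the 1 MiB chunk */

	block_fill(chunk, sizeof (chunk));
	printf("first word: %#llx\n", (unsigned long long)chunk[0]);
	return (0);
}
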
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index daf53f0a0c8b..ed592514fded 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -142,6 +142,7 @@
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
@@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
sizeof (pcs) / sizeof (uint64_t));
}
+
+ pool_raidz_expand_stat_t pres;
+ if (spa_raidz_expand_get_stats(spa, &pres) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres,
+ sizeof (pres) / sizeof (uint64_t));
+ }
}
static void
@@ -486,6 +494,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_isspare)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+ if (flags & VDEV_CONFIG_L2CACHE)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
@@ -496,7 +507,16 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
vd->vdev_asize);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
- if (vd->vdev_removing) {
+ if (vd->vdev_noalloc) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
+ vd->vdev_noalloc);
+ }
+
+ /*
+ * Slog devices are removed synchronously so don't
+ * persist the vdev_removing flag to the label.
+ */
+ if (vd->vdev_removing && !vd->vdev_islog) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
vd->vdev_removing);
}
@@ -564,6 +584,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_top_zap);
}
+ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 &&
+ spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
+ vd->vdev_root_zap);
+ }
+
if (vd->vdev_resilver_deferred) {
ASSERT(vd->vdev_ops->vdev_op_leaf);
ASSERT(spa->spa_resilver_deferred);
@@ -640,35 +666,22 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (!vd->vdev_ops->vdev_op_leaf) {
nvlist_t **child;
- int c, idx;
+ uint64_t c;
ASSERT(!vd->vdev_ishole);
child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
KM_SLEEP);
- for (c = 0, idx = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
-
- /*
- * If we're generating an nvlist of removing
- * vdevs then skip over any device which is
- * not being removed.
- */
- if ((flags & VDEV_CONFIG_REMOVING) &&
- !cvd->vdev_removing)
- continue;
-
- child[idx++] = vdev_config_generate(spa, cvd,
+ for (c = 0; c < vd->vdev_children; c++) {
+ child[c] = vdev_config_generate(spa, vd->vdev_child[c],
getstats, flags);
}
- if (idx) {
- fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
- child, idx);
- }
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ (const nvlist_t * const *)child, vd->vdev_children);
- for (c = 0; c < idx; c++)
+ for (c = 0; c < vd->vdev_children; c++)
nvlist_free(child[c]);
kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
@@ -1018,6 +1031,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
int error;
uint64_t spare_guid = 0, l2cache_guid = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason ==
+ VDEV_LABEL_REMOVE && vd->vdev_isspare));
+ boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason ==
+ VDEV_LABEL_REMOVE && vd->vdev_isl2cache));
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
@@ -1103,36 +1120,58 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
* really part of an active pool just yet. The labels will
* be written again with a meaningful txg by spa_sync().
*/
- if (reason == VDEV_LABEL_SPARE ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
+ if (reason_spare || reason_l2cache) {
/*
- * For inactive hot spares, we generate a special label that
- * identifies as a mutually shared hot spare. We write the
- * label if we are adding a hot spare, or if we are removing an
- * active hot spare (in which case we want to revert the
- * labels).
+ * For inactive hot spares and level 2 ARC devices, we generate
+ * a special label that identifies the device as a mutually
+ * shared hot spare or l2cache device. We write the label when
+ * adding or removing a hot spare or l2cache vdev (in the
+ * removal case we want to revert the labels).
*/
VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
spa_version(spa)) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- POOL_STATE_SPARE) == 0);
+ reason_spare ? POOL_STATE_SPARE : POOL_STATE_L2CACHE) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
- } else if (reason == VDEV_LABEL_L2CACHE ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
+
/*
- * For level 2 ARC devices, add a special label.
+ * This is merely to facilitate reporting the ashift of the
+ * cache device through zdb. The actual retrieval of the
+ * ashift (in vdev_alloc()) uses the nvlist
+ * spa->spa_l2cache->sav_config (populated in
+ * spa_ld_open_aux_vdevs()).
*/
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (reason_l2cache) {
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT,
+ vd->vdev_ashift) == 0);
+ }
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
- POOL_STATE_L2CACHE) == 0);
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
- vd->vdev_guid) == 0);
+ /*
+ * Add path information to help find it during pool import
+ */
+ if (vd->vdev_path != NULL) {
+ VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PATH,
+ vd->vdev_path) == 0);
+ }
+ if (vd->vdev_devid != NULL) {
+ VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_DEVID,
+ vd->vdev_devid) == 0);
+ }
+ if (vd->vdev_physpath != NULL) {
+ VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath) == 0);
+ }
+
+ /*
+ * When a spare or l2cache (aux) vdev is added during pool
+ * creation, spa->spa_uberblock is not written until this
+ * point. Write it on the next config sync.
+ */
+ if (uberblock_verify(&spa->spa_uberblock))
+ spa->spa_aux_sync_uber = B_TRUE;
} else {
uint64_t txg = 0ULL;
@@ -1164,8 +1203,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
* Initialize uberblock template.
*/
ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
- abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
+ abd_zero_off(ub_abd, sizeof (uberblock_t),
+ VDEV_UBERBLOCK_RING - sizeof (uberblock_t));
ub = abd_to_buf(ub_abd);
ub->ub_txg = 0;
@@ -1320,7 +1360,7 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
nvlist_free(config);
break;
}
- fallthrough;
+ zfs_fallthrough;
default:
/* Check for FreeBSD zfs bootonce command string */
buf = abd_to_buf(abd);
@@ -1355,6 +1395,7 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
int error;
size_t nvsize;
char *nvbuf;
+ const char *tmp;
error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
if (error != 0)
@@ -1394,8 +1435,8 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION);
switch (bootenv->vbe_version) {
case VB_RAW:
- if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) {
- (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize);
+ if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) {
+ (void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize);
}
error = 0;
break;
@@ -1488,7 +1529,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
}
struct ubl_cbdata {
- uberblock_t *ubl_ubbest; /* Best uberblock */
+ uberblock_t ubl_latest; /* Most recent uberblock */
+ uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */
vdev_t *ubl_vd; /* vdev associated with the above */
};
@@ -1505,6 +1547,9 @@ vdev_uberblock_load_done(zio_t *zio)
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock);
+ if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) {
+ cbp->ubl_latest = *ub;
+ }
if (ub->ub_txg <= spa->spa_load_max_txg &&
vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
/*
@@ -1561,11 +1606,11 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
ASSERT(ub);
ASSERT(config);
- bzero(ub, sizeof (uberblock_t));
+ memset(ub, 0, sizeof (uberblock_t));
+ memset(&cb, 0, sizeof (cb));
*config = NULL;
cb.ubl_ubbest = ub;
- cb.ubl_vd = NULL;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
zio = zio_root(spa, NULL, &cb, flags);
@@ -1582,6 +1627,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
"txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
+ if (ub->ub_raidz_reflow_info !=
+ cb.ubl_latest.ub_raidz_reflow_info) {
+ vdev_dbgmsg(cb.ubl_vd,
+ "spa=%s best uberblock (txg=%llu info=0x%llx) "
+ "has different raidz_reflow_info than latest "
+ "uberblock (txg=%llu info=0x%llx)",
+ spa->spa_name,
+ (u_longlong_t)ub->ub_txg,
+ (u_longlong_t)ub->ub_raidz_reflow_info,
+ (u_longlong_t)cb.ubl_latest.ub_txg,
+ (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info);
+ memset(ub, 0, sizeof (uberblock_t));
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ return;
+ }
+
*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
if (*config == NULL && spa->spa_extreme_rewind) {
vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
@@ -1703,13 +1764,29 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
vd->vdev_copy_uberblocks = B_FALSE;
}
+ /*
+ * We choose a slot based on the txg. If this uberblock has a special
+ * RAIDZ expansion state, then it is essentially an update of the
+ * current uberblock (it has the same txg). However, the current
+ * state is committed, so we want to write it to a different slot. If
+ * we overwrote the same slot, and we lose power during the uberblock
+ * write, and the disk does not do single-sector overwrites
+ * atomically (even though it is required to - i.e. we should see
+ * either the old or the new uberblock), then we could lose this
+ * txg's uberblock. Rewinding to the previous txg's uberblock may not
+ * be possible because RAIDZ expansion may have already overwritten
+ * some of the data, so we need the progress indicator in the
+ * uberblock.
+ */
int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
- int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m);
+ int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) %
+ (VDEV_UBERBLOCK_COUNT(vd) - m);
/* Copy the uberblock_t into the ABD */
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
- abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+ abd_zero_off(ub_abd, sizeof (uberblock_t),
+ VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
for (int l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ub_abd,
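The slot arithmetic above can be sanity-checked outside the kernel. The following standalone sketch is not part of the patch; UB_COUNT and MMP_SLOTS are made-up stand-ins for VDEV_UBERBLOCK_COUNT(vd) and the MMP reservation, and the RRSS_SCRATCH_VALID test is reduced to a plain boolean.

#include <stdio.h>

#define UB_COUNT	128	/* hypothetical VDEV_UBERBLOCK_COUNT(vd) */
#define MMP_SLOTS	1	/* hypothetical MMP_BLOCKS_PER_LABEL reservation */

/* Mirrors the expression above: n = (txg - scratch_valid) % (count - m). */
static int
ub_slot(unsigned long long txg, int scratch_valid)
{
	return ((int)((txg - (scratch_valid ? 1 : 0)) % (UB_COUNT - MMP_SLOTS)));
}

int
main(void)
{
	unsigned long long txg = 1000;

	/* The committed uberblock and its scratch-state rewrite of the */
	/* same txg land in different ring slots. */
	printf("committed slot: %d\n", ub_slot(txg, 0));
	printf("scratch   slot: %d\n", ub_slot(txg, 1));
	return (0);
}

With these values the committed uberblock for txg 1000 lands in slot 111 and the scratch-state rewrite in slot 110, so a torn single-sector write can destroy at most one of them.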
@@ -1721,7 +1798,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
}
/* Sync the uberblocks to all vdevs in svd[] */
-static int
+int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
spa_t *spa = svd[0]->vdev_spa;
@@ -1733,6 +1810,16 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
for (int v = 0; v < svdcount; v++)
vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
+ if (spa->spa_aux_sync_uber) {
+ for (int v = 0; v < spa->spa_spares.sav_count; v++) {
+ vdev_uberblock_sync(zio, &good_writes, ub,
+ spa->spa_spares.sav_vdevs[v], flags);
+ }
+ for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
+ vdev_uberblock_sync(zio, &good_writes, ub,
+ spa->spa_l2cache.sav_vdevs[v], flags);
+ }
+ }
(void) zio_wait(zio);
/*
@@ -1747,6 +1834,19 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
zio_flush(zio, svd[v]);
}
}
+ if (spa->spa_aux_sync_uber) {
+ spa->spa_aux_sync_uber = B_FALSE;
+ for (int v = 0; v < spa->spa_spares.sav_count; v++) {
+ if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) {
+ zio_flush(zio, spa->spa_spares.sav_vdevs[v]);
+ }
+ }
+ for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
+ if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) {
+ zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]);
+ }
+ }
+ }
(void) zio_wait(zio);
@@ -1927,6 +2027,7 @@ retry:
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
+ * and multihost protection isn't enabled,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
@@ -1934,7 +2035,8 @@ retry:
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
txg, spa->spa_mmp.mmp_delay);
- if (!changed && list_is_empty(&spa->spa_config_dirty_list))
+ if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
+ !spa_multihost(spa))
return (0);
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
index 5eb331046953..102eacb03349 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -35,6 +35,7 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
@@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void)
*/
typedef struct mirror_child {
vdev_t *mc_vd;
+ abd_t *mc_abd;
uint64_t mc_offset;
int mc_error;
int mc_load;
@@ -121,7 +123,7 @@ typedef struct mirror_map {
mirror_child_t mm_child[];
} mirror_map_t;
-static int vdev_mirror_shift = 21;
+static const int vdev_mirror_shift = 21;
/*
* The load configuration settings below are tuned by default for
@@ -407,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
- *physical_ashift = MAX(*physical_ashift,
- cvd->vdev_physical_ashift);
+ }
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error)
+ continue;
+ *physical_ashift = vdev_best_ashift(*logical_ashift,
+ *physical_ashift, cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {
@@ -439,32 +447,6 @@ vdev_mirror_child_done(zio_t *zio)
mc->mc_skipped = 0;
}
-static void
-vdev_mirror_scrub_done(zio_t *zio)
-{
- mirror_child_t *mc = zio->io_private;
-
- if (zio->io_error == 0) {
- zio_t *pio;
- zio_link_t *zl = NULL;
-
- mutex_enter(&zio->io_lock);
- while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
- mutex_enter(&pio->io_lock);
- ASSERT3U(zio->io_size, >=, pio->io_size);
- abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
- mutex_exit(&pio->io_lock);
- }
- mutex_exit(&zio->io_lock);
- }
-
- abd_free(zio->io_abd);
-
- mc->mc_error = zio->io_error;
- mc->mc_tried = 1;
- mc->mc_skipped = 0;
-}
-
/*
* Check the other, lower-index DVAs to see if they're on the same
* vdev as the child we picked. If they are, use them since they
@@ -549,7 +531,7 @@ vdev_mirror_child_select(zio_t *zio)
uint64_t txg = zio->io_txg;
int c, lowest_load;
- ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
+ ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
lowest_load = INT_MAX;
mm->mm_preferred_cnt = 0;
@@ -637,16 +619,15 @@ vdev_mirror_io_start(zio_t *zio)
}
if (zio->io_type == ZIO_TYPE_READ) {
- if (zio->io_bp != NULL &&
- (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
/*
- * For scrubbing reads (if we can verify the
- * checksum here, as indicated by io_bp being
- * non-NULL) we need to allocate a read buffer for
- * each child and issue reads to all children. If
- * any child succeeds, it will copy its data into
- * zio->io_data in vdev_mirror_scrub_done.
+ * For scrubbing reads we need to issue reads to all
+ * children. One child can reuse the parent's buffer, but
+ * for the others we have to allocate separate ones to
+ * verify checksums if io_bp is non-NULL, or to compare
+ * them in vdev_mirror_io_done() otherwise.
*/
+ boolean_t first = B_TRUE;
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -658,12 +639,15 @@ vdev_mirror_io_start(zio_t *zio)
continue;
}
- zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset,
+ mc->mc_abd = first ? zio->io_abd :
abd_alloc_sametype(zio->io_abd,
- zio->io_size), zio->io_size,
- zio->io_type, zio->io_priority, 0,
- vdev_mirror_scrub_done, mc));
+ zio->io_size);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, mc->mc_abd,
+ zio->io_size, zio->io_type,
+ zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
+ first = B_FALSE;
}
zio_execute(zio);
return;
@@ -731,6 +715,7 @@ vdev_mirror_io_done(zio_t *zio)
int c;
int good_copies = 0;
int unexpected_errors = 0;
+ int last_good_copy = -1;
if (mm == NULL)
return;
@@ -742,6 +727,7 @@ vdev_mirror_io_done(zio_t *zio)
if (!mc->mc_skipped)
unexpected_errors++;
} else if (mc->mc_tried) {
+ last_good_copy = c;
good_copies++;
}
}
@@ -755,7 +741,6 @@ vdev_mirror_io_done(zio_t *zio)
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
- /* XXPOLICY */
if (good_copies != mm->mm_children) {
/*
* Always require at least one good copy.
@@ -782,7 +767,6 @@ vdev_mirror_io_done(zio_t *zio)
/*
* If we don't have a good copy yet, keep trying other children.
*/
- /* XXPOLICY */
if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
ASSERT(c >= 0 && c < mm->mm_children);
mc = &mm->mm_child[c];
@@ -794,7 +778,80 @@ vdev_mirror_io_done(zio_t *zio)
return;
}
- /* XXPOLICY */
+ if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
+ abd_t *best_abd = NULL;
+ if (last_good_copy >= 0)
+ best_abd = mm->mm_child[last_good_copy].mc_abd;
+
+ /*
+ * If we're scrubbing but don't have a BP available (because
+ * this vdev is under a raidz or draid vdev) then the best we
+ * can do is compare all of the copies read. If they're not
+ * identical then return a checksum error and the most likely
+ * correct data. The raidz code will issue a repair I/O if
+ * possible.
+ */
+ if (zio->io_bp == NULL) {
+ ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
+ zio->io_vd->vdev_ops == &vdev_spare_ops);
+
+ abd_t *pref_abd = NULL;
+ for (c = 0; c < last_good_copy; c++) {
+ mc = &mm->mm_child[c];
+ if (mc->mc_error || !mc->mc_tried)
+ continue;
+
+ if (abd_cmp(mc->mc_abd, best_abd) != 0)
+ zio->io_error = SET_ERROR(ECKSUM);
+
+ /*
+ * The distributed spare is always preferred
+ * by vdev_mirror_child_select() so it's
+ * considered to be the best candidate.
+ */
+ if (pref_abd == NULL &&
+ mc->mc_vd->vdev_ops ==
+ &vdev_draid_spare_ops)
+ pref_abd = mc->mc_abd;
+
+ /*
+ * In the absence of a preferred copy, use
+ * the parent pointer to avoid a memory copy.
+ */
+ if (mc->mc_abd == zio->io_abd)
+ best_abd = mc->mc_abd;
+ }
+ if (pref_abd)
+ best_abd = pref_abd;
+ } else {
+
+ /*
+ * If we have a BP available, then checksums are
+ * already verified and we just need a buffer
+ * with valid data, preferring parent one to
+ * avoid a memory copy.
+ */
+ for (c = 0; c < last_good_copy; c++) {
+ mc = &mm->mm_child[c];
+ if (mc->mc_error || !mc->mc_tried)
+ continue;
+ if (mc->mc_abd == zio->io_abd) {
+ best_abd = mc->mc_abd;
+ break;
+ }
+ }
+ }
+
+ if (best_abd && best_abd != zio->io_abd)
+ abd_copy(zio->io_abd, best_abd, zio->io_size);
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ if (mc->mc_abd != zio->io_abd)
+ abd_free(mc->mc_abd);
+ mc->mc_abd = NULL;
+ }
+ }
+
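The BP-less policy described in the comment above can be pictured with a standalone sketch that is not part of the patch; it uses plain byte arrays and memcmp() instead of ABDs and abd_cmp(). Every successfully read copy is compared against a reference copy, any mismatch is reported as a checksum error, and one copy is still returned as the most likely correct data.

#include <stdio.h>
#include <string.h>

#define NCOPIES	3
#define SIZE	8

int
main(void)
{
	unsigned char copy[NCOPIES][SIZE] = {
		{ 1, 2, 3, 4, 5, 6, 7, 8 },
		{ 1, 2, 9, 4, 5, 6, 7, 8 },	/* silently corrupted copy */
		{ 1, 2, 3, 4, 5, 6, 7, 8 },
	};
	int best = NCOPIES - 1;		/* last good copy, as in the code */
	int cksum_error = 0;

	for (int c = 0; c < best; c++) {
		if (memcmp(copy[c], copy[best], SIZE) != 0)
			cksum_error = 1;	/* copies disagree */
	}
	printf("checksum error: %s\n", cksum_error ? "yes" : "no");
	return (0);
}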
if (good_copies == 0) {
zio->io_error = vdev_mirror_worst_error(mm);
ASSERT(zio->io_error != 0);
@@ -880,6 +937,8 @@ static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
uint64_t max_segment)
{
+ (void) start;
+
uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
SPA_MAXBLOCKSIZE);
@@ -961,20 +1020,21 @@ vdev_ops_t vdev_spare_ops = {
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
- "Rotating media load increment for non-seeking I/O's");
+ "Rotating media load increment for non-seeking I/Os");
-ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW,
- "Rotating media load increment for seeking I/O's");
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
+ ZMOD_RW, "Rotating media load increment for seeking I/Os");
-ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW,
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
+ ZMOD_RW,
"Offset in bytes from the last I/O which triggers "
"a reduced rotating media seek increment");
+/* END CSTYLED */
-ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW,
- "Non-rotating media load increment for non-seeking I/O's");
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
+ ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os");
-ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW,
- "Non-rotating media load increment for seeking I/O's");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
+ ZMOD_RW, "Non-rotating media load increment for seeking I/Os");
diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c
index e9145fd012d7..d3580882c3e0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_missing.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -42,7 +42,6 @@
#include <sys/fs/zfs.h>
#include <sys/zio.h>
-/* ARGSUSED */
static int
vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift, uint64_t *pshift)
@@ -53,6 +52,7 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
* VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
* will fail the GUID sum check before ever trying to open the pool.
*/
+ (void) vd;
*psize = 0;
*max_psize = 0;
*ashift = 0;
@@ -60,13 +60,12 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
return (0);
}
-/* ARGSUSED */
static void
vdev_missing_close(vdev_t *vd)
{
+ (void) vd;
}
-/* ARGSUSED */
static void
vdev_missing_io_start(zio_t *zio)
{
@@ -74,10 +73,10 @@ vdev_missing_io_start(zio_t *zio)
zio_execute(zio);
}
-/* ARGSUSED */
static void
vdev_missing_io_done(zio_t *zio)
{
+ (void) zio;
}
vdev_ops_t vdev_missing_ops = {
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
index cc5b15b8c028..092b3f375be0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_queue.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -121,7 +121,7 @@
* The maximum number of i/os active to each device. Ideally, this will be >=
* the sum of each queue's max_active.
*/
-uint32_t zfs_vdev_max_active = 1000;
+uint_t zfs_vdev_max_active = 1000;
/*
* Per-queue limits on the number of i/os active to each device. If the
@@ -141,24 +141,24 @@ uint32_t zfs_vdev_max_active = 1000;
* more quickly, but reads and writes to have higher latency and lower
* throughput.
*/
-uint32_t zfs_vdev_sync_read_min_active = 10;
-uint32_t zfs_vdev_sync_read_max_active = 10;
-uint32_t zfs_vdev_sync_write_min_active = 10;
-uint32_t zfs_vdev_sync_write_max_active = 10;
-uint32_t zfs_vdev_async_read_min_active = 1;
-uint32_t zfs_vdev_async_read_max_active = 3;
-uint32_t zfs_vdev_async_write_min_active = 2;
-uint32_t zfs_vdev_async_write_max_active = 10;
-uint32_t zfs_vdev_scrub_min_active = 1;
-uint32_t zfs_vdev_scrub_max_active = 3;
-uint32_t zfs_vdev_removal_min_active = 1;
-uint32_t zfs_vdev_removal_max_active = 2;
-uint32_t zfs_vdev_initializing_min_active = 1;
-uint32_t zfs_vdev_initializing_max_active = 1;
-uint32_t zfs_vdev_trim_min_active = 1;
-uint32_t zfs_vdev_trim_max_active = 2;
-uint32_t zfs_vdev_rebuild_min_active = 1;
-uint32_t zfs_vdev_rebuild_max_active = 3;
+static uint_t zfs_vdev_sync_read_min_active = 10;
+static uint_t zfs_vdev_sync_read_max_active = 10;
+static uint_t zfs_vdev_sync_write_min_active = 10;
+static uint_t zfs_vdev_sync_write_max_active = 10;
+static uint_t zfs_vdev_async_read_min_active = 1;
+/* */ uint_t zfs_vdev_async_read_max_active = 3;
+static uint_t zfs_vdev_async_write_min_active = 2;
+/* */ uint_t zfs_vdev_async_write_max_active = 10;
+static uint_t zfs_vdev_scrub_min_active = 1;
+static uint_t zfs_vdev_scrub_max_active = 3;
+static uint_t zfs_vdev_removal_min_active = 1;
+static uint_t zfs_vdev_removal_max_active = 2;
+static uint_t zfs_vdev_initializing_min_active = 1;
+static uint_t zfs_vdev_initializing_max_active = 1;
+static uint_t zfs_vdev_trim_min_active = 1;
+static uint_t zfs_vdev_trim_max_active = 2;
+static uint_t zfs_vdev_rebuild_min_active = 1;
+static uint_t zfs_vdev_rebuild_max_active = 3;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -167,8 +167,8 @@ uint32_t zfs_vdev_rebuild_max_active = 3;
* zfs_vdev_async_write_max_active. The value is linearly interpolated
* between min and max.
*/
-int zfs_vdev_async_write_active_min_dirty_percent = 30;
-int zfs_vdev_async_write_active_max_dirty_percent = 60;
+uint_t zfs_vdev_async_write_active_min_dirty_percent = 30;
+uint_t zfs_vdev_async_write_active_max_dirty_percent = 60;
/*
* For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
@@ -178,7 +178,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
* interactive I/O, then the vdev is considered to be "idle", and the number
* of concurrently-active non-interactive I/O's is increased to *_max_active.
*/
-uint_t zfs_vdev_nia_delay = 5;
+static uint_t zfs_vdev_nia_delay = 5;
/*
* Some HDDs tend to prioritize sequential I/O so high that concurrent
@@ -190,7 +190,7 @@ uint_t zfs_vdev_nia_delay = 5;
* I/Os. This enforced wait ensures the HDD services the interactive I/O
* within a reasonable amount of time.
*/
-uint_t zfs_vdev_nia_credit = 5;
+static uint_t zfs_vdev_nia_credit = 5;
/*
* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
@@ -198,10 +198,10 @@ uint_t zfs_vdev_nia_credit = 5;
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
-int zfs_vdev_aggregation_limit = 1 << 20;
-int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
-int zfs_vdev_read_gap_limit = 32 << 10;
-int zfs_vdev_write_gap_limit = 4 << 10;
+static uint_t zfs_vdev_aggregation_limit = 1 << 20;
+static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
+static uint_t zfs_vdev_read_gap_limit = 32 << 10;
+static uint_t zfs_vdev_write_gap_limit = 4 << 10;
/*
* Define the queue depth percentage for each top-level. This percentage is
@@ -214,9 +214,9 @@ int zfs_vdev_write_gap_limit = 4 << 10;
* to 30 allocations per device.
*/
#ifdef _KERNEL
-int zfs_vdev_queue_depth_pct = 1000;
+uint_t zfs_vdev_queue_depth_pct = 1000;
#else
-int zfs_vdev_queue_depth_pct = 300;
+uint_t zfs_vdev_queue_depth_pct = 300;
#endif
/*
@@ -226,14 +226,7 @@ int zfs_vdev_queue_depth_pct = 300;
* we assume that the average allocation size is 4k, so we need the queue depth
* to be 32 per allocator to get good aggregation of sequential writes.
*/
-int zfs_vdev_def_queue_depth = 32;
-
-/*
- * Allow TRIM I/Os to be aggregated. This should normally not be needed since
- * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
- * by the TRIM code in zfs_trim.c.
- */
-int zfs_vdev_aggregate_trim = 0;
+uint_t zfs_vdev_def_queue_depth = 32;
static int
vdev_queue_offset_compare(const void *x1, const void *x2)
@@ -249,39 +242,64 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
return (TREE_PCMP(z1, z2));
}
-static inline avl_tree_t *
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
-{
- return (&vq->vq_class[p].vqc_queued_tree);
-}
-
-static inline avl_tree_t *
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
-{
- ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
- if (t == ZIO_TYPE_READ)
- return (&vq->vq_read_offset_tree);
- else if (t == ZIO_TYPE_WRITE)
- return (&vq->vq_write_offset_tree);
- else
- return (&vq->vq_trim_offset_tree);
-}
+#define VDQ_T_SHIFT 29
static int
-vdev_queue_timestamp_compare(const void *x1, const void *x2)
+vdev_queue_to_compare(const void *x1, const void *x2)
{
const zio_t *z1 = (const zio_t *)x1;
const zio_t *z2 = (const zio_t *)x2;
- int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+ int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
+ z2->io_timestamp >> VDQ_T_SHIFT);
+ int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
+ int cmp = tcmp ? tcmp : ocmp;
- if (likely(cmp))
+ if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
return (cmp);
return (TREE_PCMP(z1, z2));
}
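A side note on the new comparator: io_timestamp comes from gethrtime(), so shifting it right by VDQ_T_SHIFT (29) groups I/Os into buckets of 2^29 ns, roughly 0.54 s, and within a bucket the tree is ordered by offset. The standalone sketch below is not part of the patch and only illustrates the bucket arithmetic with made-up timestamps.

#include <stdio.h>
#include <stdint.h>

#define VDQ_T_SHIFT	29	/* same shift as in the comparator above */

int
main(void)
{
	/* io_timestamp is a gethrtime() value, i.e. nanoseconds. */
	double bucket_sec = (double)(1ULL << VDQ_T_SHIFT) / 1e9;

	uint64_t t1 = 20ULL << VDQ_T_SHIFT;		/* start of a bucket */
	uint64_t t2 = t1 + 400ULL * 1000000ULL;		/* +0.4 s: same bucket */
	uint64_t t3 = t1 + 600ULL * 1000000ULL;		/* +0.6 s: next bucket */

	printf("bucket width: %.3f s\n", bucket_sec);
	printf("buckets: %llu %llu %llu\n",
	    (unsigned long long)(t1 >> VDQ_T_SHIFT),
	    (unsigned long long)(t2 >> VDQ_T_SHIFT),
	    (unsigned long long)(t3 >> VDQ_T_SHIFT));
	return (0);
}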
-static int
+static inline boolean_t
+vdev_queue_class_fifo(zio_priority_t p)
+{
+ return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
+ p == ZIO_PRIORITY_TRIM);
+}
+
+static void
+vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_priority_t p = zio->io_priority;
+ vq->vq_cqueued |= 1U << p;
+ if (vdev_queue_class_fifo(p)) {
+ list_insert_tail(&vq->vq_class[p].vqc_list, zio);
+ vq->vq_class[p].vqc_list_numnodes++;
+ } else
+ avl_add(&vq->vq_class[p].vqc_tree, zio);
+}
+
+static void
+vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_priority_t p = zio->io_priority;
+ uint32_t empty;
+ if (vdev_queue_class_fifo(p)) {
+ list_t *list = &vq->vq_class[p].vqc_list;
+ list_remove(list, zio);
+ empty = list_is_empty(list);
+ vq->vq_class[p].vqc_list_numnodes--;
+ } else {
+ avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
+ avl_remove(tree, zio);
+ empty = avl_is_empty(tree);
+ }
+ vq->vq_cqueued &= ~(empty << p);
+}
+
+static uint_t
vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
@@ -313,10 +331,10 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
}
}
-static int
+static uint_t
vdev_queue_max_async_writes(spa_t *spa)
{
- int writes;
+ uint_t writes;
uint64_t dirty = 0;
dsl_pool_t *dp = spa_get_dsl(spa);
uint64_t min_bytes = zfs_dirty_data_max *
@@ -359,8 +377,8 @@ vdev_queue_max_async_writes(spa_t *spa)
return (writes);
}
-static int
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+static uint_t
+vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
@@ -370,7 +388,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
case ZIO_PRIORITY_ASYNC_READ:
return (zfs_vdev_async_read_max_active);
case ZIO_PRIORITY_ASYNC_WRITE:
- return (vdev_queue_max_async_writes(spa));
+ return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
case ZIO_PRIORITY_SCRUB:
if (vq->vq_ia_active > 0) {
return (MIN(vq->vq_nia_credit,
@@ -408,16 +426,16 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
}
/*
- * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if
+ * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
* there is no eligible class.
*/
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
- spa_t *spa = vq->vq_vdev->vdev_spa;
- zio_priority_t p, n;
+ uint32_t cq = vq->vq_cqueued;
+ zio_priority_t p, p1;
- if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+ if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
/*
@@ -425,14 +443,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* Do round-robin to reduce starvation due to zfs_vdev_max_active
* and vq_nia_credit limits.
*/
- for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
- p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_min_active(vq, p)) {
- vq->vq_last_prio = p;
- return (p);
- }
+ p1 = vq->vq_last_prio + 1;
+ if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
+ p1 = 0;
+ for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_min_active(vq, p))
+ goto found;
+ }
+ for (p = 0; p < p1; p++) {
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_min_active(vq, p))
+ goto found;
}
/*
@@ -440,16 +462,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
- vq->vq_class[p].vqc_active <
- vdev_queue_class_max_active(spa, vq, p)) {
- vq->vq_last_prio = p;
- return (p);
- }
+ if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+ vdev_queue_class_max_active(vq, p))
+ break;
}
- /* No eligible queued i/os */
- return (ZIO_PRIORITY_NUM_QUEUEABLE);
+found:
+ vq->vq_last_prio = p;
+ return (p);
}
void
@@ -458,42 +478,30 @@ vdev_queue_init(vdev_t *vd)
vdev_queue_t *vq = &vd->vdev_queue;
zio_priority_t p;
- mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
vq->vq_vdev = vd;
- taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
-
- avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
- vdev_queue_offset_compare, sizeof (zio_t),
- offsetof(struct zio, io_offset_node));
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
- int (*compfn) (const void *, const void *);
-
- /*
- * The synchronous/trim i/o queues are dispatched in FIFO rather
- * than LBA order. This provides more consistent latency for
- * these i/os.
- */
- if (p == ZIO_PRIORITY_SYNC_READ ||
- p == ZIO_PRIORITY_SYNC_WRITE ||
- p == ZIO_PRIORITY_TRIM) {
- compfn = vdev_queue_timestamp_compare;
+ if (vdev_queue_class_fifo(p)) {
+ list_create(&vq->vq_class[p].vqc_list,
+ sizeof (zio_t),
+ offsetof(struct zio, io_queue_node.l));
} else {
- compfn = vdev_queue_offset_compare;
+ avl_create(&vq->vq_class[p].vqc_tree,
+ vdev_queue_to_compare, sizeof (zio_t),
+ offsetof(struct zio, io_queue_node.a));
}
- avl_create(vdev_queue_class_tree(vq, p), compfn,
- sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
+ avl_create(&vq->vq_read_offset_tree,
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(&vq->vq_write_offset_tree,
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
vq->vq_last_offset = 0;
+ list_create(&vq->vq_active_list, sizeof (struct zio),
+ offsetof(struct zio, io_queue_node.l));
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
}
void
@@ -501,30 +509,39 @@ vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
- for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
- avl_destroy(vdev_queue_class_tree(vq, p));
- avl_destroy(&vq->vq_active_tree);
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (vdev_queue_class_fifo(p))
+ list_destroy(&vq->vq_class[p].vqc_list);
+ else
+ avl_destroy(&vq->vq_class[p].vqc_tree);
+ }
+ avl_destroy(&vq->vq_read_offset_tree);
+ avl_destroy(&vq->vq_write_offset_tree);
+ list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
}
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+ zio->io_queue_state = ZIO_QS_QUEUED;
+ vdev_queue_class_add(vq, zio);
+ if (zio->io_type == ZIO_TYPE_READ)
+ avl_add(&vq->vq_read_offset_tree, zio);
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ avl_add(&vq->vq_write_offset_tree, zio);
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
- ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
- avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+ vdev_queue_class_remove(vq, zio);
+ if (zio->io_type == ZIO_TYPE_READ)
+ avl_remove(&vq->vq_read_offset_tree, zio);
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ avl_remove(&vq->vq_write_offset_tree, zio);
+ zio->io_queue_state = ZIO_QS_NONE;
}
static boolean_t
@@ -546,14 +563,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active++;
+ vq->vq_cactive[zio->io_priority]++;
+ vq->vq_active++;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (++vq->vq_ia_active == 1)
vq->vq_nia_credit = 1;
} else if (vq->vq_ia_active > 0) {
vq->vq_nia_credit--;
}
- avl_add(&vq->vq_active_tree, zio);
+ zio->io_queue_state = ZIO_QS_ACTIVE;
+ list_insert_tail(&vq->vq_active_list, zio);
}
static void
@@ -561,7 +580,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
- vq->vq_class[zio->io_priority].vqc_active--;
+ vq->vq_cactive[zio->io_priority]--;
+ vq->vq_active--;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (--vq->vq_ia_active == 0)
vq->vq_nia_credit = 0;
@@ -569,7 +589,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
- avl_remove(&vq->vq_active_tree, zio);
+ list_remove(&vq->vq_active_list, zio);
+ zio->io_queue_state = ZIO_QS_NONE;
}
static void
@@ -602,29 +623,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
uint64_t maxgap = 0;
uint64_t size;
uint64_t limit;
- int maxblocksize;
boolean_t stretch = B_FALSE;
- avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
- enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
uint64_t next_offset;
abd_t *abd;
+ avl_tree_t *t;
+
+ /*
+ * TRIM aggregation should not be needed since code in zfs_trim.c can
+ * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
+ */
+ if (zio->io_type == ZIO_TYPE_TRIM)
+ return (NULL);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
+ return (NULL);
- maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
if (vq->vq_vdev->vdev_nonrot)
limit = zfs_vdev_aggregation_limit_non_rotating;
else
limit = zfs_vdev_aggregation_limit;
- limit = MAX(MIN(limit, maxblocksize), 0);
-
- if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
- return (NULL);
-
- /*
- * While TRIM commands could be aggregated based on offset this
- * behavior is disabled until it's determined to be beneficial.
- */
- if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+ if (limit == 0)
return (NULL);
+ limit = MIN(limit, SPA_MAXBLOCKSIZE);
/*
* I/Os to distributed spares are directly dispatched to the dRAID
@@ -635,8 +655,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
first = last = zio;
- if (zio->io_type == ZIO_TYPE_READ)
+ if (zio->io_type == ZIO_TYPE_READ) {
maxgap = zfs_vdev_read_gap_limit;
+ t = &vq->vq_read_offset_tree;
+ } else {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ t = &vq->vq_write_offset_tree;
+ }
/*
* We can aggregate I/Os that are sufficiently adjacent and of
@@ -657,6 +682,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-optional I/O.
*/
+ zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
while ((dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= limit &&
@@ -686,7 +712,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
(IO_SPAN(first, dio) <= limit ||
(dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
- IO_SPAN(first, dio) <= maxblocksize &&
+ IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
IO_GAP(last, dio) <= maxgap &&
dio->io_type == zio->io_type) {
last = dio;
@@ -725,6 +751,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* after our span is mandatory.
*/
dio = AVL_NEXT(t, last);
+ ASSERT3P(dio, !=, NULL);
dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
} else {
/* do not include the optional i/o */
@@ -739,7 +766,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
return (NULL);
size = IO_SPAN(first, last);
- ASSERT3U(size, <=, maxblocksize);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
abd = abd_alloc_gang();
if (abd == NULL)
@@ -747,8 +774,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
abd, size, first->io_type, zio->io_priority,
- flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
- vdev_queue_agg_io_done, NULL);
+ flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
nio = first;
@@ -756,6 +782,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
do {
dio = nio;
nio = AVL_NEXT(t, dio);
+ ASSERT3P(dio, !=, NULL);
zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio);
@@ -823,19 +850,30 @@ again:
return (NULL);
}
- /*
- * For LBA-ordered queues (async / scrub / initializing), issue the
- * i/o which follows the most recently issued i/o in LBA (offset) order.
- *
- * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
- */
- tree = vdev_queue_class_tree(vq, p);
- vq->vq_io_search.io_timestamp = 0;
- vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
- VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
- zio = avl_nearest(tree, idx, AVL_AFTER);
- if (zio == NULL)
- zio = avl_first(tree);
+ if (vdev_queue_class_fifo(p)) {
+ zio = list_head(&vq->vq_class[p].vqc_list);
+ } else {
+ /*
+ * For LBA-ordered queues (async / scrub / initializing),
+ * issue the I/O which follows the most recently issued I/O
+ * in LBA (offset) order, but to avoid starvation only within
+ * the same 0.5 second interval as the first I/O.
+ */
+ tree = &vq->vq_class[p].vqc_tree;
+ zio = aio = avl_first(tree);
+ if (zio->io_offset < vq->vq_last_offset) {
+ vq->vq_io_search.io_timestamp = zio->io_timestamp;
+ vq->vq_io_search.io_offset = vq->vq_last_offset;
+ zio = avl_find(tree, &vq->vq_io_search, &idx);
+ if (zio == NULL) {
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL ||
+ (zio->io_timestamp >> VDQ_T_SHIFT) !=
+ (aio->io_timestamp >> VDQ_T_SHIFT))
+ zio = aio;
+ }
+ }
+ }
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);
@@ -905,7 +943,7 @@ vdev_queue_io(zio_t *zio)
ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
}
- zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+ zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();
mutex_enter(&vq->vq_lock);
@@ -966,7 +1004,6 @@ void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
- avl_tree_t *tree;
/*
* ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1001,12 +1038,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* Otherwise, the zio is currently active and we cannot change its
* priority.
*/
- tree = vdev_queue_class_tree(vq, zio->io_priority);
- if (avl_find(tree, zio, NULL) == zio) {
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ if (zio->io_queue_state == ZIO_QS_QUEUED) {
+ vdev_queue_class_remove(vq, zio);
zio->io_priority = priority;
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
- } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ vdev_queue_class_add(vq, zio);
+ } else if (zio->io_queue_state == ZIO_QS_NONE) {
zio->io_priority = priority;
}
@@ -1019,10 +1055,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* vq_lock mutex use here, instead we prefer to keep it lock free for
* performance.
*/
-int
+uint32_t
vdev_queue_length(vdev_t *vd)
{
- return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+ return (vd->vdev_queue.vq_active);
}
uint64_t
@@ -1031,91 +1067,99 @@ vdev_queue_last_offset(vdev_t *vd)
return (vd->vdev_queue.vq_last_offset);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW,
- "Max vdev I/O aggregation size");
+uint64_t
+vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+ if (vdev_queue_class_fifo(p))
+ return (vq->vq_class[p].vqc_list_numnodes);
+ else
+ return (avl_numnodes(&vq->vq_class[p].vqc_tree));
+}
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW,
- "Max vdev I/O aggregation size for non-rotating media");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
+ "Max vdev I/O aggregation size");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW,
- "Allow TRIM I/O to be aggregated");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
+ ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
"Aggregate read I/O over gap");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, UINT, ZMOD_RW,
"Aggregate write I/O over gap");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, UINT, ZMOD_RW,
"Maximum number of active I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW,
- "Async write concurrency max threshold");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent,
+ UINT, ZMOD_RW, "Async write concurrency max threshold");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW,
- "Async write concurrency min threshold");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent,
+ UINT, ZMOD_RW, "Async write concurrency min threshold");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, UINT, ZMOD_RW,
"Max active async read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, UINT, ZMOD_RW,
"Min active async read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, UINT, ZMOD_RW,
"Max active async write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, UINT, ZMOD_RW,
"Min active async write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, UINT, ZMOD_RW,
"Max active initializing I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, UINT, ZMOD_RW,
"Min active initializing I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, UINT, ZMOD_RW,
"Max active removal I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, UINT, ZMOD_RW,
"Min active removal I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, UINT, ZMOD_RW,
"Max active scrub I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, UINT, ZMOD_RW,
"Min active scrub I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, UINT, ZMOD_RW,
"Max active sync read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, UINT, ZMOD_RW,
"Min active sync read I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, UINT, ZMOD_RW,
"Max active sync write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, UINT, ZMOD_RW,
"Min active sync write I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, UINT, ZMOD_RW,
"Max active trim/discard I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, UINT, ZMOD_RW,
"Min active trim/discard I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, UINT, ZMOD_RW,
"Max active rebuild I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, UINT, ZMOD_RW,
"Min active rebuild I/Os per vdev");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW,
"Number of non-interactive I/Os to allow in sequence");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW,
"Number of non-interactive I/Os before _max_active");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,
"Queue depth percentage for each top-level vdev");
-/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
+ "Default queue depth for each allocator");
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index 7e7202ec1e55..15c8b8ca6016 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -27,15 +27,22 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/dmu_tx.h>
#include <sys/abd.h>
+#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
+#include <sys/uberblock_impl.h>
+#include <sys/dsl_scan.h>
#ifdef ZFS_DEBUG
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
@@ -135,6 +142,237 @@
VDEV_RAIDZ_64MUL_2((x), mask); \
}
+
+/*
+ * Big Theory Statement for how a RAIDZ VDEV is expanded
+ *
+ * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
+ * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
+ * that have been previously expanded can be expanded again.
+ *
+ * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
+ * the VDEV) when an expansion starts. The expansion will pause if any disk
+ * in the VDEV fails, and will resume once the VDEV is healthy again. All other
+ * operations on the pool can continue while an expansion is in progress (e.g.
+ * read/write, snapshot, zpool add, etc.), except for zpool checkpoint, zpool
+ * trim, and zpool initialize, which can't be run during an expansion. Following a
+ * reboot or export/import, the expansion resumes where it left off.
+ *
+ * == Reflowing the Data ==
+ *
+ * The expansion involves reflowing (copying) the data from the current set
+ * of disks to spread it across the new set which now has one more disk. This
+ * reflow operation is similar to reflowing text when the column width of a
+ * text editor window is expanded. The text doesn’t change but the location of
+ * the text changes to accommodate the new width. An example reflow result for
+ * a 4-wide RAIDZ1 to a 5-wide is shown below.
+ *
+ * Reflow End State
+ * Each letter indicates a parity group (logical stripe)
+ *
+ * Before expansion After Expansion
+ * D1 D2 D3 D4 D1 D2 D3 D4 D5
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | A | A | A | A | | A | A | A | A | B |
+ * | 1| 2| 3| 4| | 1| 2| 3| 4| 5|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | B | B | C | C | | B | C | C | C | C |
+ * | 5| 6| 7| 8| | 6| 7| 8| 9| 10|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | C | C | D | D | | D | D | E | E | E |
+ * | 9| 10| 11| 12| | 11| 12| 13| 14| 15|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | E | E | E | E | --> | E | F | F | G | G |
+ * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | F | F | G | G | | G | G | H | H | H |
+ * | 17| 18| 19| 20| | 21| 22| 23| 24| 25|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | G | G | H | H | | H | I | I | J | J |
+ * | 21| 22| 23| 24| | 26| 27| 28| 29| 30|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | H | H | I | I | | J | J | | | K |
+ * | 25| 26| 27| 28| | 31| 32| 33| 34| 35|
+ * +------+------+------+------+ +------+------+------+------+------+
+ *
+ * This reflow approach has several advantages. There is no need to read or
+ * modify the block pointers or recompute any block checksums. The reflow
+ * doesn’t need to know where the parity sectors reside. We can read and write
+ * data sequentially and the copy can occur in a background thread in open
+ * context. The design also allows for fast discovery of what data to copy.
+ *
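The diagram's numbering is simply a row-major renumbering of the allocated sectors across the wider set of children. The standalone sketch below is not part of the patch; it reproduces the old and new positions of the numbered sectors from the diagram, using 1-based sector numbers and 0-based rows.

#include <stdio.h>

/* Map a 1-based sector number (as in the diagram) to (row, child). */
static void
locate(int sector, int width, int *row, int *col)
{
	*row = (sector - 1) / width;
	*col = (sector - 1) % width;
}

int
main(void)
{
	int orow, ocol, nrow, ncol;

	for (int s = 1; s <= 28; s++) {
		locate(s, 4, &orow, &ocol);	/* old 4-wide layout */
		locate(s, 5, &nrow, &ncol);	/* new 5-wide layout */
		printf("sector %2d: D%d row %d  ->  D%d row %d\n",
		    s, ocol + 1, orow, ncol + 1, nrow);
	}
	return (0);
}

For example, sector 19 moves from D3 row 4 to D4 row 3, matching the "G 19" cell in the two halves of the diagram.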
+ * The VDEV metaslabs are processed, one at a time, to copy the block data to
+ * have it flow across all the disks. The metaslab is disabled for allocations
+ * during the copy. As an optimization, we only copy the allocated data which
+ * can be determined by looking at the metaslab range tree. During the copy we
+ * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
+ * need to be able to survive losing parity count disks). This means we
+ * cannot overwrite data during the reflow that would be needed if a disk is
+ * lost.
+ *
+ * After the reflow completes, all newly-written blocks will have the new
+ * layout, i.e., they will have the parity to data ratio implied by the new
+ * number of disks in the RAIDZ group. Even though the reflow copies all of
+ * the allocated space (data and parity), it is only rearranged, not changed.
+ *
+ * This act of reflowing the data has a few implications about blocks
+ * that were written before the reflow completes:
+ *
+ * - Old blocks will still use the same amount of space (i.e., they will have
+ * the parity to data ratio implied by the old number of disks in the RAIDZ
+ * group).
+ * - Reading old blocks will be slightly slower than before the reflow, for
+ * two reasons. First, we will have to read from all disks in the RAIDZ
+ * VDEV, rather than being able to skip the children that contain only
+ * parity of this block (because the data of a single block is now spread
+ * out across all the disks). Second, in most cases there will be an extra
+ * bcopy, needed to rearrange the data back to its original layout in memory.
+ *
+ * == Scratch Area ==
+ *
+ * As we copy the block data, we can only progress to the point that writes
+ * will not overlap with blocks whose progress has not yet been recorded on
+ * disk. Since partially-copied rows are always read from the old location,
+ * we need to stop one row before the sector-wise overlap, to prevent any
+ * row-wise overlap. For example, in the diagram above, when we reflow sector
+ * B6 it will overwrite the original location for B5.
+ *
+ * To get around this, a scratch space is used so that we can start copying
+ * without risking data loss by overlapping the row. As an added benefit, it
+ * improves performance at the beginning of the reflow, but that small perf
+ * boost wouldn't be worth the complexity on its own.
+ *
+ * Ideally we want to copy at least 2 * (new_width)^2 sectors so that we have a
+ * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
+ * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
+ * the widths will likely be single digits so we can get a substantial chunk
+ * size using only a few MB of scratch per disk.
+ *
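For concreteness, the sizing rule above can be tabulated with a standalone sketch that is not part of the patch and assumes 4K sectors: 2*(new_width)^2 sectors spread over new_width disks is 2*new_width sectors per disk, about 2 MiB at the maximum width of 255 and only tens of KiB at typical single-digit widths.

#include <stdio.h>

#define SECTOR	4096ULL		/* assumed sector size, as in the text above */

int
main(void)
{
	unsigned long long widths[] = { 4, 5, 10, 255 };

	for (int i = 0; i < 4; i++) {
		unsigned long long w = widths[i];
		unsigned long long sectors = 2 * w * w;	/* ideal copy size */

		/* 2*w^2 sectors over w disks is 2*w sectors per disk. */
		printf("new_width %3llu: %6llu sectors total, %4llu KiB/disk\n",
		    w, sectors, (sectors / w) * SECTOR / 1024);
	}
	return (0);
}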
+ * The scratch area is persisted to disk, and it holds a large amount of
+ * reflowed state. We can always read back the partially written stripes when
+ * a disk fails or the copy is interrupted (crash) during the initial copying
+ * phase, and it also gets us past the small chunk size restriction. At a
+ * minimum, the scratch space must be large enough to get us to the point
+ * where one row does not overlap itself when moved (i.e., new_width^2
+ * sectors). But going larger is even better. We
+ * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
+ * as our scratch space to handle overwriting the initial part of the VDEV.
+ *
+ * 0 256K 512K 4M
+ * +------+------+-----------------------+-----------------------------
+ * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ...
+ * | L0 | L1 | Reserved | (Metaslabs)
+ * +------+------+-----------------------+-------------------------------
+ * Scratch Area
+ *
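The sizing arithmetic above can be made concrete with a small standalone sketch (illustration only, not part of this change; it assumes the 2 * (new_width)^2 figure counts sectors and that they spread evenly across the new_width children):

#include <stdio.h>

int
main(void)
{
	unsigned long long new_width = 255;	/* maximum RAIDZ width */
	unsigned long long sector = 4096;	/* 4K sectors (ashift = 12) */

	/* 2 * new_width^2 sectors in total, spread over new_width disks */
	unsigned long long per_disk_sectors = 2 * new_width * new_width /
	    new_width;
	unsigned long long per_disk_bytes = per_disk_sectors * sector;

	/* prints "510 sectors, 2088960 bytes per disk" (~2 MiB) */
	printf("%llu sectors, %llu bytes per disk\n",
	    per_disk_sectors, per_disk_bytes);
	return (0);
}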
+ * == Reflow Progress Updates ==
+ *
+ * After the initial scratch-based reflow, the expansion process works
+ * similarly to device removal. We create a new open context thread which
+ * reflows the data, and periodically kicks off sync tasks to update logical
+ * state. In this case, state is the committed progress (offset of next data
+ * to copy). We need to persist the completed offset on disk, so that if we
+ * crash we know which format each VDEV offset is in.
+ *
+ * == Time Dependent Geometry ==
+ *
+ * In non-expanded RAIDZ, blocks are read from disk in a column by column
+ * fashion. For a multi-row block, the second sector is in the first column,
+ * not in the second column. This allows us to issue full reads for each
+ * column directly into the request buffer. The block data is thus laid out
+ * sequentially in a column-by-column fashion.
+ *
+ * For example, in the before expansion diagram above, one logical block might
+ * be sectors G19-H26. The parity is in G19,H23; and the data is in
+ * G20,H24,G21,H25,G22,H26.
+ *
+ * After a block is reflowed, the sectors that were all in a single original
+ * column can now reside in different columns. When reading from an expanded
+ * VDEV, we need to know the logical stripe width for each block so we can
+ * reconstitute the block's data after the reads are completed. Likewise,
+ * when we perform the combinatorial reconstruction we need to know the
+ * original width so we can retry combinations from the past layouts.
+ *
+ * Time-dependent geometry is what we call having blocks with different
+ * layouts (stripe widths) in the same VDEV. This time-dependent geometry uses
+ * the block's birth time (+ the time expansion ended) to establish the
+ * correct width for a given block. After an expansion completes, we record
+ * the time for blocks written with a particular width (geometry).
+ *
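A minimal standalone sketch of the lookup this implies (illustration only, not part of this change; the record layout here is hypothetical, while the patch itself keeps these records in the vd_expand_txgs AVL tree queried by vdev_raidz_get_logical_width() later in this file):

#include <stdint.h>

/* Hypothetical record: blocks born at or after 'txg' use stripe 'width'. */
typedef struct {
	uint64_t txg;
	uint64_t width;
} width_record_t;

/*
 * Pick the stripe width for a block: the newest record at or before the
 * block's birth txg wins; if there is none, the pre-expansion width applies.
 * records[] is sorted by ascending txg.
 */
static uint64_t
width_for_birth_txg(const width_record_t *records, int nrecords,
    uint64_t original_width, uint64_t birth_txg)
{
	uint64_t width = original_width;

	for (int i = 0; i < nrecords; i++) {
		if (records[i].txg <= birth_txg)
			width = records[i].width;
	}
	return (width);
}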
+ * == On Disk Format Changes ==
+ *
+ * A new pool feature flag, 'raidz_expansion', is added; its reference count
+ * is the number of RAIDZ VDEVs that have been expanded.
+ *
+ * Blocks on an expanded RAIDZ VDEV can have different logical stripe widths.
+ *
+ * Since the uberblock can point to arbitrary blocks, which might be on the
+ * expanding RAIDZ and might or might not have been expanded, we need to know
+ * which way a block is laid out before reading it. This info is the next
+ * offset that needs to be reflowed, and we persist it in the uberblock, in
+ * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
+ * After the expansion is complete, we then use the raidz_expand_txgs array
+ * (see below) to determine how to read a block, and the ub_raidz_reflow_info
+ * field is no longer required.
+ *
+ * The uberblock's ub_raidz_reflow_info field also holds the scratch space
+ * state (i.e., active or not) which is also required before reading a block
+ * during the initial phase of reflowing the data.
+ *
+ * The top-level RAIDZ VDEV has two new entries in the nvlist:
+ *
+ * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
+ * and used after the expansion is complete to
+ * determine how to read a raidz block
+ * 'raidz_expanding' boolean: present during reflow and removed after completion
+ * used during a spa import to resume an unfinished
+ * expansion
+ *
+ * And finally the VDEV's top zap adds the following informational entries:
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
+ */
+
+/*
+ * For testing only: pause the raidz expansion after reflowing this amount.
+ * (accessed by ZTS and ztest)
+ */
+#ifdef _KERNEL
+static
+#endif /* _KERNEL */
+unsigned long raidz_expand_max_reflow_bytes = 0;
+
+/*
+ * For testing only: pause the raidz expansion at a certain point.
+ */
+uint_t raidz_expand_pause_point = 0;
+
+/*
+ * Maximum amount of copy i/o's outstanding at once.
+ */
+static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
+
+/*
+ * Apply raidz map abds aggregation if the number of rows in the map is equal
+ * to or greater than the value below.
+ */
+static unsigned long raidz_io_aggregate_rows = 4;
+
+/*
+ * Automatically start a pool scrub when a RAIDZ expansion completes in
+ * order to verify the checksums of all blocks which have been copied
+ * during the expansion. Automatic scrubbing is enabled by default and
+ * is strongly recommended.
+ */
+static int zfs_scrub_after_expand = 1;
+
static void
vdev_raidz_row_free(raidz_row_t *rr)
{
@@ -159,6 +397,17 @@ vdev_raidz_map_free(raidz_map_t *rm)
for (int i = 0; i < rm->rm_nrows; i++)
vdev_raidz_row_free(rm->rm_row[i]);
+ if (rm->rm_nphys_cols) {
+ for (int i = 0; i < rm->rm_nphys_cols; i++) {
+ if (rm->rm_phys_col[i].rc_abd != NULL)
+ abd_free(rm->rm_phys_col[i].rc_abd);
+ }
+
+ kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
+ rm->rm_nphys_cols);
+ }
+
+ ASSERT3P(rm->rm_lr, ==, NULL);
kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}
@@ -170,10 +419,37 @@ vdev_raidz_map_free_vsd(zio_t *zio)
vdev_raidz_map_free(rm);
}
+static int
+vdev_raidz_reflow_compare(const void *x1, const void *x2)
+{
+ const reflow_node_t *l = x1;
+ const reflow_node_t *r = x2;
+
+ return (TREE_CMP(l->re_txg, r->re_txg));
+}
+
const zio_vsd_ops_t vdev_raidz_vsd_ops = {
.vsd_free = vdev_raidz_map_free_vsd,
};
+raidz_row_t *
+vdev_raidz_row_alloc(int cols)
+{
+ raidz_row_t *rr =
+ kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
+
+ rr->rr_cols = cols;
+ rr->rr_scols = cols;
+
+ for (int c = 0; c < cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_shadow_devidx = INT_MAX;
+ rc->rc_shadow_offset = UINT64_MAX;
+ rc->rc_allow_repair = 1;
+ }
+ return (rr);
+}
+
static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
@@ -302,7 +578,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << ashift;
- uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t acols, scols;
raidz_map_t *rm =
kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
@@ -312,22 +588,22 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
*/
- q = s / (dcols - nparity);
+ uint64_t q = s / (dcols - nparity);
/*
* "Remainder": The number of partial stripe data sectors in this I/O.
* This will add a sector to some, but not all, child vdevs.
*/
- r = s - q * (dcols - nparity);
+ uint64_t r = s - q * (dcols - nparity);
/* The number of "big columns" - those which contain remainder data. */
- bc = (r == 0 ? 0 : r + nparity);
+ uint64_t bc = (r == 0 ? 0 : r + nparity);
/*
* The total number of data and parity sectors associated with
* this I/O.
*/
- tot = s + nparity * (q + (r == 0 ? 0 : 1));
+ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
/*
* acols: The columns that will be accessed.
@@ -343,43 +619,28 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
}
ASSERT3U(acols, <=, scols);
-
- rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
+ rr = vdev_raidz_row_alloc(scols);
rm->rm_row[0] = rr;
-
rr->rr_cols = acols;
- rr->rr_scols = scols;
rr->rr_bigcols = bc;
- rr->rr_missingdata = 0;
- rr->rr_missingparity = 0;
rr->rr_firstdatacol = nparity;
- rr->rr_abd_empty = NULL;
- rr->rr_nempty = 0;
#ifdef ZFS_DEBUG
rr->rr_offset = zio->io_offset;
rr->rr_size = zio->io_size;
#endif
- asize = 0;
+ uint64_t asize = 0;
- for (c = 0; c < scols; c++) {
+ for (uint64_t c = 0; c < scols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
- col = f + c;
- coff = o;
+ uint64_t col = f + c;
+ uint64_t coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << ashift;
}
rc->rc_devidx = col;
rc->rc_offset = coff;
- rc->rc_abd = NULL;
- rc->rc_orig_data = NULL;
- rc->rc_error = 0;
- rc->rc_tried = 0;
- rc->rc_skipped = 0;
- rc->rc_force_repair = 0;
- rc->rc_allow_repair = 1;
- rc->rc_need_orig_restore = B_FALSE;
if (c >= acols)
rc->rc_size = 0;
@@ -419,13 +680,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
- devidx = rr->rr_col[0].rc_devidx;
+ uint64_t devidx = rr->rr_col[0].rc_devidx;
o = rr->rr_col[0].rc_offset;
rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
rr->rr_col[1].rc_devidx = devidx;
rr->rr_col[1].rc_offset = o;
-
if (rm->rm_skipstart == 0)
rm->rm_skipstart = 1;
}
@@ -435,7 +695,338 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
} else {
vdev_raidz_map_alloc_read(zio, rm);
}
+ /* init RAIDZ parity ops */
+ rm->rm_ops = vdev_raidz_math_get_ops();
+
+ return (rm);
+}
+
+/*
+ * Everything before reflow_offset_synced should have been moved to the new
+ * location (read and write completed). However, this may not yet be reflected
+ * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
+ * uberblock has not yet been written). If reflow is not in progress,
+ * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
+ * entirely before reflow_offset_synced, it will come from the new location.
+ * Otherwise this row will come from the old location. Therefore, rows that
+ * straddle the reflow_offset_synced will come from the old location.
+ *
+ * For writes, reflow_offset_next is the next offset to copy. If a sector has
+ * been copied, but not yet reflected in the on-disk progress
+ * (reflow_offset_synced), it will also be written to the new (already copied)
+ * offset.
+ */
+noinline raidz_map_t *
+vdev_raidz_map_alloc_expanded(zio_t *zio,
+ uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
+ uint64_t nparity, uint64_t reflow_offset_synced,
+ uint64_t reflow_offset_next, boolean_t use_scratch)
+{
+ abd_t *abd = zio->io_abd;
+ uint64_t offset = zio->io_offset;
+ uint64_t size = zio->io_size;
+
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> ashift;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ * AKA "full rows"
+ */
+ uint64_t q = s / (logical_cols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ uint64_t r = s - q * (logical_cols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ uint64_t bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* How many rows contain data (not skip) */
+ uint64_t rows = howmany(tot, logical_cols);
+ int cols = MIN(tot, logical_cols);
+
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
+ KM_SLEEP);
+ rm->rm_nrows = rows;
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ rm->rm_skipstart = bc;
+ uint64_t asize = 0;
+
+ for (uint64_t row = 0; row < rows; row++) {
+ boolean_t row_use_scratch = B_FALSE;
+ raidz_row_t *rr = vdev_raidz_row_alloc(cols);
+ rm->rm_row[row] = rr;
+
+ /* The starting RAIDZ (parent) vdev sector of the row. */
+ uint64_t b = (offset >> ashift) + row * logical_cols;
+
+ /*
+ * If we are in the middle of a reflow, and the copying has
+ * not yet completed for any part of this row, then use the
+ * old location of this row. Note that reflow_offset_synced
+ * reflects the i/o that's been completed, because it's
+ * updated by a synctask, after zio_wait(spa_txg_zio[]).
+ * This is sufficient for our check, even if that progress
+ * has not yet been recorded to disk (reflected in
+ * spa_ubsync). Also note that we consider the last row to
+ * be "full width" (`cols`-wide rather than `bc`-wide) for
+ * this calculation. This causes a tiny bit of unnecessary
+ * double-writes but is safe and simpler to calculate.
+ */
+ int row_phys_cols = physical_cols;
+ if (b + cols > reflow_offset_synced >> ashift)
+ row_phys_cols--;
+ else if (use_scratch)
+ row_use_scratch = B_TRUE;
+
+ /* starting child of this row */
+ uint64_t child_id = b % row_phys_cols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t child_offset = (b / row_phys_cols) << ashift;
+
+ /*
+ * Note, rr_cols is the entire width of the block, even
+ * if this row is shorter. This is needed because parity
+ * generation (for Q and R) needs to know the entire width,
+ * because it treats the short row as though it was
+ * full-width (and the "phantom" sectors were zero-filled).
+ *
+ * Another approach to this would be to set cols shorter
+ * (to just the number of columns that we might do i/o to)
+ * and have another mechanism to tell the parity generation
+ * about the "entire width". Reconstruction (at least
+ * vdev_raidz_reconstruct_general()) would also need to
+ * know about the "entire width".
+ */
+ rr->rr_firstdatacol = nparity;
+#ifdef ZFS_DEBUG
+ /*
+ * note: rr_size is PSIZE, not ASIZE
+ */
+ rr->rr_offset = b << ashift;
+ rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
+#endif
+
+ for (int c = 0; c < rr->rr_cols; c++, child_id++) {
+ if (child_id >= row_phys_cols) {
+ child_id -= row_phys_cols;
+ child_offset += 1ULL << ashift;
+ }
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_devidx = child_id;
+ rc->rc_offset = child_offset;
+
+ /*
+ * Get this from the scratch space if appropriate.
+ * This only happens if we crashed in the middle of
+ * raidz_reflow_scratch_sync() (while it's running,
+ * the rangelock prevents us from doing concurrent
+ * io), and even then only during zpool import or
+ * when the pool is imported readonly.
+ */
+ if (row_use_scratch)
+ rc->rc_offset -= VDEV_BOOT_SIZE;
+
+ uint64_t dc = c - rr->rr_firstdatacol;
+ if (c < rr->rr_firstdatacol) {
+ rc->rc_size = 1ULL << ashift;
+
+ /*
+ * Parity sectors' rc_abd's are set below
+ * after determining if this is an aggregation.
+ */
+ } else if (row == rows - 1 && bc != 0 && c >= bc) {
+ /*
+ * Past the end of the block (even including
+ * skip sectors). This sector is part of the
+ * map so that we have full rows for p/q parity
+ * generation.
+ */
+ rc->rc_size = 0;
+ rc->rc_abd = NULL;
+ } else {
+ /* "data column" (col excluding parity) */
+ uint64_t off;
+
+ if (c < bc || r == 0) {
+ off = dc * rows + row;
+ } else {
+ off = r * rows +
+ (dc - r) * (rows - 1) + row;
+ }
+ rc->rc_size = 1ULL << ashift;
+ rc->rc_abd = abd_get_offset_struct(
+ &rc->rc_abdstruct, abd, off << ashift,
+ rc->rc_size);
+ }
+
+ if (rc->rc_size == 0)
+ continue;
+
+ /*
+ * If any part of this row is in both old and new
+ * locations, the primary location is the old
+ * location. If this sector was already copied to the
+ * new location, we need to also write to the new,
+ * "shadow" location.
+ *
+ * Note, `row_phys_cols != physical_cols` indicates
+ * that the primary location is the old location.
+ * `b+c < reflow_offset_next` indicates that the copy
+ * to the new location has been initiated. We know
+ * that the copy has completed because we have the
+ * rangelock, which is held exclusively while the
+ * copy is in progress.
+ */
+ if (row_use_scratch ||
+ (row_phys_cols != physical_cols &&
+ b + c < reflow_offset_next >> ashift)) {
+ rc->rc_shadow_devidx = (b + c) % physical_cols;
+ rc->rc_shadow_offset =
+ ((b + c) / physical_cols) << ashift;
+ if (row_use_scratch)
+ rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
+ }
+
+ asize += rc->rc_size;
+ }
+
+ /*
+ * See comment in vdev_raidz_map_alloc()
+ */
+ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
+ (offset & (1ULL << 20))) {
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+
+ int devidx0 = rr->rr_col[0].rc_devidx;
+ uint64_t offset0 = rr->rr_col[0].rc_offset;
+ int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
+ uint64_t shadow_offset0 =
+ rr->rr_col[0].rc_shadow_offset;
+
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[0].rc_shadow_devidx =
+ rr->rr_col[1].rc_shadow_devidx;
+ rr->rr_col[0].rc_shadow_offset =
+ rr->rr_col[1].rc_shadow_offset;
+
+ rr->rr_col[1].rc_devidx = devidx0;
+ rr->rr_col[1].rc_offset = offset0;
+ rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
+ rr->rr_col[1].rc_shadow_offset = shadow_offset0;
+ }
+ }
+ ASSERT3U(asize, ==, tot << ashift);
+
+ /*
+ * Determine if the block is contiguous, in which case we can use
+ * an aggregation.
+ */
+ if (rows >= raidz_io_aggregate_rows) {
+ rm->rm_nphys_cols = physical_cols;
+ rm->rm_phys_col =
+ kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
+ KM_SLEEP);
+
+ /*
+ * Determine the aggregate io's offset and size, and check
+ * that the io is contiguous.
+ */
+ for (int i = 0;
+ i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ raidz_col_t *prc =
+ &rm->rm_phys_col[rc->rc_devidx];
+
+ if (rc->rc_size == 0)
+ continue;
+
+ if (prc->rc_size == 0) {
+ ASSERT0(prc->rc_offset);
+ prc->rc_offset = rc->rc_offset;
+ } else if (prc->rc_offset + prc->rc_size !=
+ rc->rc_offset) {
+ /*
+ * This block is not contiguous and
+ * therefore can't be aggregated.
+ * This is expected to be rare, so
+ * the cost of allocating and then
+ * freeing rm_phys_col is not
+ * significant.
+ */
+ kmem_free(rm->rm_phys_col,
+ sizeof (raidz_col_t) *
+ rm->rm_nphys_cols);
+ rm->rm_phys_col = NULL;
+ rm->rm_nphys_cols = 0;
+ break;
+ }
+ prc->rc_size += rc->rc_size;
+ }
+ }
+ }
+ if (rm->rm_phys_col != NULL) {
+ /*
+ * Allocate aggregate ABD's.
+ */
+ for (int i = 0; i < rm->rm_nphys_cols; i++) {
+ raidz_col_t *prc = &rm->rm_phys_col[i];
+
+ prc->rc_devidx = i;
+
+ if (prc->rc_size == 0)
+ continue;
+
+ prc->rc_abd =
+ abd_alloc_linear(rm->rm_phys_col[i].rc_size,
+ B_FALSE);
+ }
+ /*
+ * Point the parity abd's into the aggregate abd's.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ raidz_col_t *prc =
+ &rm->rm_phys_col[rc->rc_devidx];
+ rc->rc_abd =
+ abd_get_offset_struct(&rc->rc_abdstruct,
+ prc->rc_abd,
+ rc->rc_offset - prc->rc_offset,
+ rc->rc_size);
+ }
+ }
+ } else {
+ /*
+ * Allocate new abd's for the parity sectors.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd =
+ abd_alloc_linear(rc->rc_size,
+ B_TRUE);
+ }
+ }
+ }
/* init RAIDZ parity ops */
rm->rm_ops = vdev_raidz_math_get_ops();
@@ -453,11 +1044,11 @@ vdev_raidz_p_func(void *buf, size_t size, void *private)
{
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
- int i, cnt = size / sizeof (src[0]);
+ int cnt = size / sizeof (src[0]);
ASSERT(pqr->p && !pqr->q && !pqr->r);
- for (i = 0; i < cnt; i++, src++, pqr->p++)
+ for (int i = 0; i < cnt; i++, src++, pqr->p++)
*pqr->p ^= *src;
return (0);
@@ -469,11 +1060,11 @@ vdev_raidz_pq_func(void *buf, size_t size, void *private)
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
uint64_t mask;
- int i, cnt = size / sizeof (src[0]);
+ int cnt = size / sizeof (src[0]);
ASSERT(pqr->p && pqr->q && !pqr->r);
- for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
*pqr->p ^= *src;
VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
*pqr->q ^= *src;
@@ -488,11 +1079,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private)
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
uint64_t mask;
- int i, cnt = size / sizeof (src[0]);
+ int cnt = size / sizeof (src[0]);
ASSERT(pqr->p && pqr->q && pqr->r);
- for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
*pqr->p ^= *src;
VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
*pqr->q ^= *src;
@@ -618,7 +1209,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
void
vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
{
- ASSERT3U(rr->rr_cols, !=, 0);
+ if (rr->rr_cols == 0) {
+ /*
+ * We are handling this block one row at a time (because
+ * this block has a different logical vs physical width,
+ * due to RAIDZ expansion), and this is a pad-only row,
+ * which has no parity.
+ */
+ return;
+ }
/* Generate using the new math implementation */
if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
@@ -648,10 +1247,10 @@ vdev_raidz_generate_parity(raidz_map_t *rm)
}
}
-/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
+ (void) private;
uint64_t *dst = dbuf;
uint64_t *src = sbuf;
int cnt = size / sizeof (src[0]);
@@ -663,11 +1262,11 @@ vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
return (0);
}
-/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
void *private)
{
+ (void) private;
uint64_t *dst = dbuf;
uint64_t *src = sbuf;
uint64_t mask;
@@ -681,10 +1280,10 @@ vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
return (0);
}
-/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
+ (void) private;
uint64_t *dst = buf;
uint64_t mask;
int cnt = size / sizeof (dst[0]);
@@ -770,6 +1369,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
int x = tgts[0];
abd_t *dst, *src;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
+
ASSERT3U(ntgts, ==, 1);
ASSERT3U(x, >=, rr->rr_firstdatacol);
ASSERT3U(x, <, rr->rr_cols);
@@ -802,6 +1404,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
int c, exp;
abd_t *dst, *src;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
+
ASSERT(ntgts == 1);
ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
@@ -848,6 +1453,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
int y = tgts[1];
abd_t *xd, *yd;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
+
ASSERT(ntgts == 2);
ASSERT(x < y);
ASSERT(x >= rr->rr_firstdatacol);
@@ -926,7 +1534,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}
-/* BEGIN CSTYLED */
/*
* In the general case of reconstruction, we must solve the system of linear
* equations defined by the coefficients used to generate parity as well as
@@ -1078,7 +1685,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
* that reason, we only build the coefficients in the rows that correspond to
* targeted columns.
*/
-/* END CSTYLED */
static void
vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
@@ -1285,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
- int n, i, c, t, tt;
- int nmissing_rows;
+ int i, c, t, tt;
+ unsigned int n;
+ unsigned int nmissing_rows;
int missing_rows[VDEV_RAIDZ_MAXPARITY];
int parity_map[VDEV_RAIDZ_MAXPARITY];
uint8_t *p, *pp;
@@ -1297,11 +1904,14 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
abd_t **bufs = NULL;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
/*
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
* temporary linear ABDs if any non-linear ABDs are found.
*/
for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
+ ASSERT(rr->rr_col[i].rc_abd != NULL);
if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
KM_PUSHPAGE);
@@ -1429,10 +2039,23 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
+ zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
+ rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
+ (int)rr->rr_missingparity);
+ }
+
nbadparity = rr->rr_firstdatacol;
nbaddata = rr->rr_cols - nbadparity;
ntgts = 0;
for (i = 0, c = 0; c < rr->rr_cols; c++) {
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
+ zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
+ "offset=%llx error=%u)",
+ rr, c, (int)rr->rr_col[c].rc_devidx,
+ (long long)rr->rr_col[c].rc_offset,
+ (int)rr->rr_col[c].rc_error);
+ }
if (c < rr->rr_firstdatacol)
parity_valid[c] = B_FALSE;
@@ -1529,12 +2152,25 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
- *physical_ashift = MAX(*physical_ashift,
- cvd->vdev_physical_ashift);
}
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error != 0)
+ continue;
+ *physical_ashift = vdev_best_ashift(*logical_ashift,
+ *physical_ashift, cvd->vdev_physical_ashift);
+ }
+
+ if (vd->vdev_rz_expanding) {
+ *asize *= vd->vdev_children - 1;
+ *max_asize *= vd->vdev_children - 1;
- *asize *= vd->vdev_children;
- *max_asize *= vd->vdev_children;
+ vd->vdev_min_asize = *asize;
+ } else {
+ *asize *= vd->vdev_children;
+ *max_asize *= vd->vdev_children;
+ }
if (numerrors > nparity) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
@@ -1553,19 +2189,70 @@ vdev_raidz_close(vdev_t *vd)
}
}
+/*
+ * Return the logical width to use, given the txg in which the allocation
+ * happened. Note that BP_GET_BIRTH() is usually the txg in which the
+ * BP was allocated. Remapped BP's (that were relocated due to device
+ * removal, see remap_blkptr_cb()) will have a more recent physical birth
+ * which reflects when the BP was relocated, but we can ignore these because
+ * they can't be on RAIDZ (device removal doesn't support RAIDZ).
+ */
static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
+{
+ reflow_node_t lookup = {
+ .re_txg = txg,
+ };
+ avl_index_t where;
+
+ uint64_t width;
+ mutex_enter(&vdrz->vd_expand_lock);
+ reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
+ if (re != NULL) {
+ width = re->re_logical_width;
+ } else {
+ re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
+ if (re != NULL)
+ width = re->re_logical_width;
+ else
+ width = vdrz->vd_original_width;
+ }
+ mutex_exit(&vdrz->vd_expand_lock);
+ return (width);
+}
+
+/*
+ * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
+ * more space due to the lower data-to-parity ratio. In this case it's
+ * important to pass in the correct txg. Note that vdev_gang_header_asize()
+ * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
+ * regardless of txg. This is assured because for a single data sector, we
+ * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
+ */
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vdrz->vd_logical_width;
+ uint64_t cols = vdrz->vd_original_width;
uint64_t nparity = vdrz->vd_nparity;
+ cols = vdev_raidz_get_logical_width(vdrz, txg);
+
asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
asize = roundup(asize, nparity + 1) << ashift;
+#ifdef ZFS_DEBUG
+ uint64_t asize_new = ((psize - 1) >> ashift) + 1;
+ uint64_t ncols_new = vdrz->vd_physical_width;
+ asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
+ (ncols_new - nparity));
+ asize_new = roundup(asize_new, nparity + 1) << ashift;
+ VERIFY3U(asize_new, <=, asize);
+#endif
+
return (asize);
}
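A standalone restatement of the arithmetic above with hypothetical inputs (illustration only, not part of this change): for raidz1 expanded from 4 to 5 children with ashift=12, a 400K psize is charged 134 sectors under the old width but only 126 under the new one, which is exactly why the txg-dependent width matters here.

#include <stdint.h>

/* Same arithmetic as vdev_raidz_asize() above, with explicit inputs. */
static uint64_t
raidz_asize_sketch(uint64_t psize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
	uint64_t asize = ((psize - 1) >> ashift) + 1;

	/* one group of parity sectors per (cols - nparity) data sectors */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	/* round up to a multiple of (nparity + 1) to account for skip sectors */
	asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);
	return (asize << ashift);
}

/*
 * raidz_asize_sketch(400 << 10, 12, 4, 1) == 134ULL << 12   (old width)
 * raidz_asize_sketch(400 << 10, 12, 5, 1) == 126ULL << 12   (new width)
 */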
@@ -1592,21 +2279,37 @@ vdev_raidz_child_done(zio_t *zio)
}
static void
-vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+vdev_raidz_shadow_child_done(zio_t *zio)
{
-#ifdef ZFS_DEBUG
- vdev_t *tvd = vd->vdev_top;
+ raidz_col_t *rc = zio->io_private;
+
+ rc->rc_shadow_error = zio->io_error;
+}
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
+{
+ (void) rm;
+#ifdef ZFS_DEBUG
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = rr->rr_offset;
logical_rs.rs_end = logical_rs.rs_start +
- vdev_raidz_asize(vd, rr->rr_size);
+ vdev_raidz_asize(zio->io_vd, rr->rr_size,
+ BP_GET_BIRTH(zio->io_bp));
raidz_col_t *rc = &rr->rr_col[col];
- vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
ASSERT(vdev_xlate_is_empty(&remain_rs));
+ if (vdev_xlate_is_empty(&physical_rs)) {
+ /*
+ * If we are in the middle of expansion, the
+ * physical->logical mapping is changing so vdev_xlate()
+ * can't give us a reliable answer.
+ */
+ return;
+ }
ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
/*
@@ -1617,7 +2320,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
*/
if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
- rc->rc_size + (1 << tvd->vdev_ashift));
+ rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
} else {
ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
}
@@ -1625,7 +2328,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
}
static void
-vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
+vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
{
vdev_t *vd = zio->io_vd;
raidz_map_t *rm = zio->io_vsd;
@@ -1637,31 +2340,66 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
/* Verify physical to logical translation */
- vdev_raidz_io_verify(vd, rr, c);
+ vdev_raidz_io_verify(zio, rm, rr, c);
- if (rc->rc_size > 0) {
- ASSERT3P(rc->rc_abd, !=, NULL);
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd,
- abd_get_size(rc->rc_abd), zio->io_type,
- zio->io_priority, 0, vdev_raidz_child_done, rc));
- } else {
- /*
- * Generate optional write for skip sector to improve
- * aggregation contiguity.
- */
- ASSERT3P(rc->rc_abd, ==, NULL);
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, NULL, 1ULL << ashift,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
- NULL));
+ if (rc->rc_size == 0)
+ continue;
+
+ ASSERT3U(rc->rc_offset + rc->rc_size, <,
+ cvd->vdev_psize - VDEV_LABEL_END_SIZE);
+
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd,
+ abd_get_size(rc->rc_abd), zio->io_type,
+ zio->io_priority, 0, vdev_raidz_child_done, rc));
+
+ if (rc->rc_shadow_devidx != INT_MAX) {
+ vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
+
+ ASSERT3U(
+ rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
+ cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
+ rc->rc_shadow_offset, rc->rc_abd,
+ abd_get_size(rc->rc_abd),
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_shadow_child_done, rc));
}
}
}
+/*
+ * Generate optional I/Os for skip sectors to improve aggregation contiguity.
+ * This only works for vdev_raidz_map_alloc() (not _expanded()).
+ */
+static void
+raidz_start_skip_writes(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ raidz_map_t *rm = zio->io_vsd;
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+ for (int c = 0; c < rr->rr_scols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ if (rc->rc_size != 0)
+ continue;
+ ASSERT3P(rc->rc_abd, ==, NULL);
+
+ ASSERT3U(rc->rc_offset, <,
+ cvd->vdev_psize - VDEV_LABEL_END_SIZE);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
+ NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+}
+
static void
-vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
+vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
{
vdev_t *vd = zio->io_vd;
@@ -1693,7 +2431,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_skipped = 1;
continue;
}
- if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+ if (forceparity ||
+ c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
@@ -1703,6 +2442,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
}
}
+static void
+vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
+{
+ vdev_t *vd = zio->io_vd;
+
+ for (int i = 0; i < rm->rm_nphys_cols; i++) {
+ raidz_col_t *prc = &rm->rm_phys_col[i];
+ if (prc->rc_size == 0)
+ continue;
+
+ ASSERT3U(prc->rc_devidx, ==, i);
+ vdev_t *cvd = vd->vdev_child[i];
+ if (!vdev_readable(cvd)) {
+ prc->rc_error = SET_ERROR(ENXIO);
+ prc->rc_tried = 1; /* don't even try */
+ prc->rc_skipped = 1;
+ continue;
+ }
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
+ prc->rc_error = SET_ERROR(ESTALE);
+ prc->rc_skipped = 1;
+ continue;
+ }
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ prc->rc_offset, prc->rc_abd, prc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, prc));
+ }
+}
+
+static void
+vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
+{
+ /*
+ * If there are multiple rows, we will be hitting
+ * all disks, so go ahead and read the parity so
+ * that we are reading in decent size chunks.
+ */
+ boolean_t forceparity = rm->rm_nrows > 1;
+
+ if (rm->rm_phys_col) {
+ vdev_raidz_io_start_read_phys_cols(zio, rm);
+ } else {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_io_start_read_row(zio, rr, forceparity);
+ }
+ }
+}
+
/*
* Start an IO operation on a RAIDZ VDev
*
@@ -1726,24 +2515,83 @@ vdev_raidz_io_start(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
vdev_raidz_t *vdrz = vd->vdev_tsd;
+ raidz_map_t *rm;
+
+ uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
+ BP_GET_BIRTH(zio->io_bp));
+ if (logical_width != vdrz->vd_physical_width) {
+ zfs_locked_range_t *lr = NULL;
+ uint64_t synced_offset = UINT64_MAX;
+ uint64_t next_offset = UINT64_MAX;
+ boolean_t use_scratch = B_FALSE;
+ /*
+ * Note: when the expansion is completing, we set
+ * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
+ * in a later txg than when we last update spa_ubsync's state
+ * (see the end of spa_raidz_expand_thread()). Therefore we
+ * may see vre_state!=SCANNING before
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
+ * on disk, but the copying progress has been synced to disk
+ * (and reflected in spa_ubsync). In this case it's fine to
+ * treat the expansion as completed, since if we crash there's
+ * no additional copying to do.
+ */
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
+ ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
+ &vdrz->vn_vre);
+ lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
+ zio->io_offset, zio->io_size, RL_READER);
+ use_scratch =
+ (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
+ RRSS_SCRATCH_VALID);
+ synced_offset =
+ RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
+ next_offset = vdrz->vn_vre.vre_offset;
+ /*
+ * If we haven't resumed expanding since importing the
+ * pool, vre_offset won't have been set yet. In
+ * this case the next offset to be copied is the same
+ * as what was synced.
+ */
+ if (next_offset == UINT64_MAX) {
+ next_offset = synced_offset;
+ }
+ }
+ if (use_scratch) {
+ zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
+ "%lld next_offset=%lld use_scratch=%u",
+ zio,
+ zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
+ (long long)zio->io_offset,
+ (long long)synced_offset,
+ (long long)next_offset,
+ use_scratch);
+ }
+
+ rm = vdev_raidz_map_alloc_expanded(zio,
+ tvd->vdev_ashift, vdrz->vd_physical_width,
+ logical_width, vdrz->vd_nparity,
+ synced_offset, next_offset, use_scratch);
+ rm->rm_lr = lr;
+ } else {
+ rm = vdev_raidz_map_alloc(zio,
+ tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
+ }
+ rm->rm_original_width = vdrz->vd_original_width;
- raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
- vdrz->vd_logical_width, vdrz->vd_nparity);
zio->io_vsd = rm;
zio->io_vsd_ops = &vdev_raidz_vsd_ops;
-
- /*
- * Until raidz expansion is implemented all maps for a raidz vdev
- * contain a single row.
- */
- ASSERT3U(rm->rm_nrows, ==, 1);
- raidz_row_t *rr = rm->rm_row[0];
-
if (zio->io_type == ZIO_TYPE_WRITE) {
- vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_write(zio, rm->rm_row[i]);
+ }
+
+ if (logical_width == vdrz->vd_physical_width) {
+ raidz_start_skip_writes(zio);
+ }
} else {
ASSERT(zio->io_type == ZIO_TYPE_READ);
- vdev_raidz_io_start_read(zio, rr);
+ vdev_raidz_io_start_read(zio, rm);
}
zio_execute(zio);
@@ -1752,8 +2600,8 @@ vdev_raidz_io_start(zio_t *zio)
/*
* Report a checksum error for a child of a RAID-Z device.
*/
-static void
-raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
+void
+vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
@@ -1765,12 +2613,12 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
- (void) zfs_ereport_post_checksum(zio->io_spa, vd,
- &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
- rc->rc_abd, bad_data, &zbc);
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
+ (void) zfs_ereport_post_checksum(zio->io_spa, vd,
+ &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
+ rc->rc_abd, bad_data, &zbc);
}
}
@@ -1781,11 +2629,9 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
static int
raidz_checksum_verify(zio_t *zio)
{
- zio_bad_cksum_t zbc;
+ zio_bad_cksum_t zbc = {0};
raidz_map_t *rm = zio->io_vsd;
- bzero(&zbc, sizeof (zio_bad_cksum_t));
-
int ret = zio_checksum_error(zio, &zbc);
if (ret != 0 && zbc.zbc_injected != 0)
rm->rm_ecksuminjected = 1;
@@ -1819,11 +2665,19 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
if (!rc->rc_tried || rc->rc_error != 0)
continue;
- orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
- abd_copy(orig[c], rc->rc_abd, rc->rc_size);
+ orig[c] = rc->rc_abd;
+ ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
+ rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
}
/*
+ * Verify any empty sectors are zero filled to ensure the parity
+ * is calculated correctly even if these non-data sectors are damaged.
+ */
+ if (rr->rr_nempty && rr->rr_abd_empty != NULL)
+ ret += vdev_draid_map_verify_empty(zio, rr);
+
+ /*
* Regenerates parity even for !tried||rc_error!=0 columns. This
* isn't harmful but it does have the side effect of fixing stuff
* we didn't realize was necessary (i.e. even if we return 0).
@@ -1837,7 +2691,9 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
continue;
if (abd_cmp(orig[c], rc->rc_abd) != 0) {
- raidz_checksum_error(zio, rc, orig[c]);
+ zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
+ c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
+ vdev_raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
}
@@ -1852,8 +2708,10 @@ vdev_raidz_worst_error(raidz_row_t *rr)
{
int error = 0;
- for (int c = 0; c < rr->rr_cols; c++)
+ for (int c = 0; c < rr->rr_cols; c++) {
error = zio_worst_error(error, rr->rr_col[c].rc_error);
+ error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
+ }
return (error);
}
@@ -1882,6 +2740,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
parity_untried++;
}
+
+ if (rc->rc_force_repair)
+ unexpected_errors++;
}
/*
@@ -1897,7 +2758,6 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
(zio->io_flags & ZIO_FLAG_RESILVER)) {
int n = raidz_parity_verify(zio, rr);
unexpected_errors += n;
- ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
}
if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
@@ -1917,6 +2777,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
continue;
}
+ zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
+ "offset=%llx",
+ zio, c, rc->rc_devidx, (long long)rc->rc_offset);
+
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE,
@@ -1926,6 +2790,42 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
}
+
+ /*
+ * Scrub or resilver i/o's: overwrite any shadow locations with the
+ * good data. This ensures that if we've already copied this sector,
+ * it will be corrected if it was damaged. This writes more than is
+ * necessary, but since expansion is paused during scrub/resilver, at
+ * most a single row will have a shadow location.
+ */
+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
+ (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *vd = zio->io_vd;
+
+ if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
+
+ /*
+ * Note: We don't want to update the repair stats
+ * because that would incorrectly indicate that there
+ * was bad data to repair, which we aren't sure about.
+ * By clearing the SCAN_THREAD flag, we prevent this
+ * from happening, despite having the REPAIR flag set.
+ * We need to set SELF_HEAL so that this i/o can't be
+ * bypassed by zio_vdev_io_start().
+ */
+ zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL);
+ cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
+ zio_nowait(cio);
+ }
+ }
}
static void
@@ -1945,6 +2845,43 @@ raidz_restore_orig_data(raidz_map_t *rm)
}
/*
+ * During raidz_reconstruct() for an expanded VDEV, we need to give special
+ * consideration to failure simulations. See the note in raidz_reconstruct()
+ * on simulating failure of a pre-expansion device.
+ *
+ * Treating logical child i as failed, return TRUE if the given column should
+ * be treated as failed. The idea of logical children allows us to imagine
+ * that a disk silently failed before a RAIDZ expansion (reads from this disk
+ * succeed but return the wrong data). Since the expansion doesn't verify
+ * checksums, the incorrect data will be moved to new locations spread among
+ * the children (going diagonally across them).
+ *
+ * Higher "logical child failures" (values of `i`) indicate these
+ * "pre-expansion failures". The first physical_width values imagine that a
+ * current child failed; the next physical_width-1 values imagine that a
+ * child failed before the most recent expansion; the next physical_width-2
+ * values imagine a child failed in the expansion before that, etc.
+ */
+static boolean_t
+raidz_simulate_failure(int physical_width, int original_width, int ashift,
+ int i, raidz_col_t *rc)
+{
+ uint64_t sector_id =
+ physical_width * (rc->rc_offset >> ashift) +
+ rc->rc_devidx;
+
+ for (int w = physical_width; w >= original_width; w--) {
+ if (i < w) {
+ return (sector_id % w == i);
+ } else {
+ i -= w;
+ }
+ }
+ ASSERT(!"invalid logical child id");
+ return (B_FALSE);
+}
+
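A worked example may help make the logical-child mapping concrete (hypothetical numbers, illustration only, not part of this change):

/*
 * With physical_width = 6 and original_width = 4 there are 6 + 5 + 4 = 15
 * logical children. Consider a column on devidx 2 whose rc_offset is sector
 * 17 of that child: sector_id = 6 * 17 + 2 = 104. Then:
 *
 *   i = 3  (current child 3)   -> 104 % 6 == 3?  2 != 3  -> not failed
 *   i = 8  (width-5 child 2)   -> 104 % 5 == 2?  4 != 2  -> not failed
 *   i = 10 (width-5 child 4)   -> 104 % 5 == 4?  yes     -> treated as failed
 *   i = 13 (width-4 child 2)   -> 104 % 4 == 2?  0 != 2  -> not failed
 */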
+/*
* returns EINVAL if reconstruction of the block will not be possible
* returns ECKSUM if this specific reconstruction failed
* returns 0 on successful reconstruction
@@ -1953,6 +2890,15 @@ static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
raidz_map_t *rm = zio->io_vsd;
+ int physical_width = zio->io_vd->vdev_children;
+ int original_width = (rm->rm_original_width != 0) ?
+ rm->rm_original_width : physical_width;
+ int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
+
+ if (dbgmsg) {
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
+ "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
+ }
/* Reconstruct each row */
for (int r = 0; r < rm->rm_nrows; r++) {
@@ -1962,6 +2908,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
int dead = 0;
int dead_data = 0;
+ if (dbgmsg)
+ zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
+
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
ASSERT0(rc->rc_need_orig_restore);
@@ -1974,7 +2923,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
if (rc->rc_size == 0)
continue;
for (int lt = 0; lt < ntgts; lt++) {
- if (rc->rc_devidx == ltgts[lt]) {
+ if (raidz_simulate_failure(physical_width,
+ original_width,
+ zio->io_vd->vdev_top->vdev_ashift,
+ ltgts[lt], rc)) {
if (rc->rc_orig_data == NULL) {
rc->rc_orig_data =
abd_alloc_linear(
@@ -1987,13 +2939,37 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
dead++;
if (c >= nparity)
dead_data++;
- my_tgts[t++] = c;
+ /*
+ * Note: simulating failure of a
+ * pre-expansion device can hit more
+ * than one column, in which case we
+ * might try to simulate more failures
+ * than can be reconstructed, which is
+ * also more than the size of my_tgts.
+ * This check prevents accessing past
+ * the end of my_tgts. The "dead >
+ * nparity" check below will fail this
+ * reconstruction attempt.
+ */
+ if (t < VDEV_RAIDZ_MAXPARITY) {
+ my_tgts[t++] = c;
+ if (dbgmsg) {
+ zfs_dbgmsg("simulating "
+ "failure of col %u "
+ "devidx %u", c,
+ (int)rc->rc_devidx);
+ }
+ }
break;
}
}
}
if (dead > nparity) {
/* reconstruction not possible */
+ if (dbgmsg) {
+ zfs_dbgmsg("reconstruction not possible; "
+ "too many failures");
+ }
raidz_restore_orig_data(rm);
return (EINVAL);
}
@@ -2023,7 +2999,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
*/
if (rc->rc_error == 0 &&
c >= rr->rr_firstdatacol) {
- raidz_checksum_error(zio,
+ vdev_raidz_checksum_error(zio,
rc, rc->rc_orig_data);
rc->rc_error =
SET_ERROR(ECKSUM);
@@ -2037,11 +3013,19 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
zio_checksum_verified(zio);
+ if (dbgmsg) {
+ zfs_dbgmsg("reconstruction successful "
+ "(checksum verified)");
+ }
return (0);
}
/* Reconstruction failed - restore original data */
raidz_restore_orig_data(rm);
+ if (dbgmsg) {
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
+ "failed", zio);
+ }
return (ECKSUM);
}
@@ -2056,7 +3040,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
* The order that we find the various possible combinations of failed
* disks is dictated by these rules:
* - Examine each "slot" (the "i" in tgts[i])
- * - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ * - Try to increment this slot (tgts[i] += 1)
* - if we can't increment because it runs into the next slot,
* reset our slot to the minimum, and examine the next slot
*
@@ -2087,18 +3071,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
*
* This strategy works for dRAID but is less efficient when there are a large
* number of child vdevs and therefore permutations to check. Furthermore,
- * since the raidz_map_t rows likely do not overlap reconstruction would be
+ * since the raidz_map_t rows likely do not overlap, reconstruction would be
* possible as long as there are no more than nparity data errors per row.
* These additional permutations are not currently checked but could be as
* a future improvement.
+ *
+ * Returns 0 on success, ECKSUM on failure.
*/
static int
vdev_raidz_combrec(zio_t *zio)
{
int nparity = vdev_get_nparity(zio->io_vd);
raidz_map_t *rm = zio->io_vsd;
+ int physical_width = zio->io_vd->vdev_children;
+ int original_width = (rm->rm_original_width != 0) ?
+ rm->rm_original_width : physical_width;
- /* Check if there's enough data to attempt reconstrution. */
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
int total_errors = 0;
@@ -2116,8 +3104,16 @@ vdev_raidz_combrec(zio_t *zio)
int tstore[VDEV_RAIDZ_MAXPARITY + 2];
int *ltgts = &tstore[1]; /* value is logical child ID */
- /* Determine number of logical children, n */
- int n = zio->io_vd->vdev_children;
+
+ /*
+ * Determine number of logical children, n. See comment
+ * above raidz_simulate_failure().
+ */
+ int n = 0;
+ for (int w = physical_width;
+ w >= original_width; w--) {
+ n += w;
+ }
ASSERT3U(num_failures, <=, nparity);
ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
@@ -2148,6 +3144,14 @@ vdev_raidz_combrec(zio_t *zio)
if (ltgts[t] == n) {
/* try more failures */
ASSERT3U(t, ==, num_failures - 1);
+ if (zfs_flags &
+ ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
+ zfs_dbgmsg("reconstruction "
+ "failed for num_failures="
+ "%u; tried all "
+ "combinations",
+ num_failures);
+ }
break;
}
@@ -2159,7 +3163,7 @@ vdev_raidz_combrec(zio_t *zio)
* Try the next combination.
*/
if (ltgts[t] != ltgts[t + 1])
- break;
+ break; // found next combination
/*
* Otherwise, reset this tgt to the minimum,
@@ -2174,7 +3178,8 @@ vdev_raidz_combrec(zio_t *zio)
break;
}
}
-
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruction failed for all num_failures");
return (ECKSUM);
}
@@ -2199,7 +3204,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
static void
vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
- int total_errors = 0;
+ int normal_errors = 0;
+ int shadow_errors = 0;
ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
@@ -2208,24 +3214,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
- if (rc->rc_error) {
+ if (rc->rc_error != 0) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
-
- total_errors++;
+ normal_errors++;
+ }
+ if (rc->rc_shadow_error != 0) {
+ ASSERT(rc->rc_shadow_error != ECKSUM);
+ shadow_errors++;
}
}
/*
* Treat partial writes as a success. If we couldn't write enough
- * columns to reconstruct the data, the I/O failed. Otherwise,
- * good enough.
+ * columns to reconstruct the data, the I/O failed. Otherwise, good
+ * enough. Note that in the case of a shadow write (during raidz
+ * expansion), depending on if we crash, either the normal (old) or
+ * shadow (new) location may become the "real" version of the block,
+ * so both locations must have sufficient redundancy.
*
* Now that we support write reallocation, it would be better
* to treat partial failure as real failure unless there are
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
- if (total_errors > rr->rr_firstdatacol) {
+ if (normal_errors > rr->rr_firstdatacol ||
+ shadow_errors > rr->rr_firstdatacol) {
zio->io_error = zio_worst_error(zio->io_error,
vdev_raidz_worst_error(rr));
}
@@ -2242,14 +3255,24 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
- if (rc->rc_error) {
- ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+ /*
+ * If scrubbing and a replacing/sparing child vdev determined
+ * that not all of its children have an identical copy of the
+ * data, then clear the error so the column is treated like
+ * any other read and force a repair to correct the damage.
+ */
+ if (rc->rc_error == ECKSUM) {
+ ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
+ vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
+ rc->rc_force_repair = 1;
+ rc->rc_error = 0;
+ }
+ if (rc->rc_error) {
if (c < rr->rr_firstdatacol)
parity_errors++;
else
@@ -2314,7 +3337,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
* for a normal read then allocate an ABD for them now so they
* may be read, verified, and any needed repairs performed.
*/
- if (rr->rr_nempty && rr->rr_abd_empty == NULL)
+ if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
vdev_draid_map_alloc_empty(zio, rr);
for (int c = 0; c < rr->rr_cols; c++) {
@@ -2357,12 +3380,12 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
- (void) zfs_ereport_start_checksum(zio->io_spa,
- cvd, &zio->io_bookmark, zio, rc->rc_offset,
- rc->rc_size, &zbc);
mutex_enter(&cvd->vdev_stat_lock);
cvd->vdev_stat.vs_checksum_errors++;
mutex_exit(&cvd->vdev_stat_lock);
+ (void) zfs_ereport_start_checksum(zio->io_spa,
+ cvd, &zio->io_bookmark, zio, rc->rc_offset,
+ rc->rc_size, &zbc);
}
}
}
@@ -2372,11 +3395,48 @@ vdev_raidz_io_done(zio_t *zio)
{
raidz_map_t *rm = zio->io_vsd;
+ ASSERT(zio->io_bp != NULL);
if (zio->io_type == ZIO_TYPE_WRITE) {
for (int i = 0; i < rm->rm_nrows; i++) {
vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
}
} else {
+ if (rm->rm_phys_col) {
+ /*
+ * This is an aggregated read. Copy the data and status
+ * from the aggregate abd's to the individual rows.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_tried || rc->rc_size == 0)
+ continue;
+
+ raidz_col_t *prc =
+ &rm->rm_phys_col[rc->rc_devidx];
+ rc->rc_error = prc->rc_error;
+ rc->rc_tried = prc->rc_tried;
+ rc->rc_skipped = prc->rc_skipped;
+ if (c >= rr->rr_firstdatacol) {
+ /*
+ * Note: this is slightly faster
+ * than using abd_copy_off().
+ */
+ char *physbuf = abd_to_buf(
+ prc->rc_abd);
+ void *physloc = physbuf +
+ rc->rc_offset -
+ prc->rc_offset;
+
+ abd_copy_from_buf(rc->rc_abd,
+ physloc, rc->rc_size);
+ }
+ }
+ }
+ }
+
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_reconstruct_known_missing(zio,
@@ -2423,7 +3483,54 @@ vdev_raidz_io_done(zio_t *zio)
zio_vdev_io_redone(zio);
return;
}
-
+ /*
+ * It would be too expensive to try every possible
+ * combination of failed sectors in every row, so
+ * instead we try every combination of failed current or
+ * past physical disk. This means that if the incorrect
+ * sectors were all on Nparity disks at any point in the
+ * past, we will find the correct data. The only known
+ * case where this is less durable than a non-expanded
+ * RAIDZ is if we have a silent failure during
+ * expansion. In that case, one block could be
+ * partially in the old format and partially in the
+ * new format, so we'd lose some sectors from the old
+ * format and some from the new format.
+ *
+ * e.g. logical_width=4 physical_width=6
+ * the 15 (6+5+4) possible failed disks are:
+ * width=6 child=0
+ * width=6 child=1
+ * width=6 child=2
+ * width=6 child=3
+ * width=6 child=4
+ * width=6 child=5
+ * width=5 child=0
+ * width=5 child=1
+ * width=5 child=2
+ * width=5 child=3
+ * width=5 child=4
+ * width=4 child=0
+ * width=4 child=1
+ * width=4 child=2
+ * width=4 child=3
+ * And we will try every combination of Nparity of these
+ * failing.
+ *
+ * As a first pass, we can generate every combo,
+ * and try reconstructing, ignoring any known
+ * failures. If any row has too many known + simulated
+ * failures, then we bail on reconstructing with this
+ * number of simulated failures. As an improvement,
+ * we could detect the number of whole known failures
+ * (i.e. we have known failures on these disks for
+ * every row; the disks never succeeded), and
+ * subtract that from the max # failures to simulate.
+ * We could go even further like the current
+ * combrec code, but that doesn't seem like it
+ * gains us very much. If we simulate a failure
+ * that is also a known failure, that's fine.
+ */
zio->io_error = vdev_raidz_combrec(zio);
if (zio->io_error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -2431,6 +3538,10 @@ vdev_raidz_io_done(zio_t *zio)
}
}
}
+ if (rm->rm_lr != NULL) {
+ zfs_rangelock_exit(rm->rm_lr);
+ rm->rm_lr = NULL;
+ }
}
static void
@@ -2457,6 +3568,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ /*
+ * If we're in the middle of a RAIDZ expansion, this block may be in
+ * the old and/or new location. For simplicity, always resilver it.
+ */
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING)
+ return (B_TRUE);
+
uint64_t dcols = vd->vdev_children;
uint64_t nparity = vdrz->vd_nparity;
uint64_t ashift = vd->vdev_top->vdev_ashift;
@@ -2496,10 +3615,29 @@ static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
+ (void) remain_rs;
+
vdev_t *raidvd = cvd->vdev_parent;
ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
- uint64_t width = raidvd->vdev_children;
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
+ /*
+ * We're in the middle of expansion, in which case the
+ * translation is in flux. Any answer we give may be wrong
+ * by the time we return, so it isn't safe for the caller to
+ * act on it. Therefore we say that this range isn't present
+ * on any children. The only consumers of this are "zpool
+ * initialize" and trimming, both of which are "best effort"
+ * anyway.
+ */
+ physical_rs->rs_start = physical_rs->rs_end = 0;
+ remain_rs->rs_start = remain_rs->rs_end = 0;
+ return;
+ }
+
+ uint64_t width = vdrz->vd_physical_width;
uint64_t tgt_col = cvd->vdev_id;
uint64_t ashift = raidvd->vdev_top->vdev_ashift;
@@ -2525,15 +3663,1156 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
logical_rs->rs_end - logical_rs->rs_start);
}
+static void
+raidz_reflow_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+
+ /*
+ * Ensure there are no i/os to the range that is being committed.
+ */
+ uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
+ ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
+
+ mutex_enter(&vre->vre_lock);
+ uint64_t new_offset =
+ MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
+ /*
+ * We should not have committed anything that failed.
+ */
+ VERIFY3U(vre->vre_failed_offset, >=, old_offset);
+ mutex_exit(&vre->vre_lock);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
+ old_offset, new_offset - old_offset,
+ RL_WRITER);
+
+ /*
+ * Update the uberblock that will be written when this txg completes.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_uberblock,
+ RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
+ vre->vre_offset_pertxg[txgoff] = 0;
+ zfs_rangelock_exit(lr);
+
+ mutex_enter(&vre->vre_lock);
+ vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
+ vre->vre_bytes_copied_pertxg[txgoff] = 0;
+ mutex_exit(&vre->vre_lock);
+
+ vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
+ sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
+}
+
+static void
+raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+
+ for (int i = 0; i < TXG_SIZE; i++)
+ VERIFY0(vre->vre_offset_pertxg[i]);
+
+ reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
+ re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
+ re->re_logical_width = vdrz->vd_physical_width;
+ mutex_enter(&vdrz->vd_expand_lock);
+ avl_add(&vdrz->vd_expand_txgs, re);
+ mutex_exit(&vdrz->vd_expand_lock);
+
+ vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
+
+ /*
+ * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
+ * will get written (based on vd_expand_txgs).
+ */
+ vdev_config_dirty(vd);
+
+ /*
+ * Before we change vre_state, the on-disk state must reflect that we
+ * have completed all copying, so that vdev_raidz_io_start() can use
+ * vre_state to determine if the reflow is in progress. See also the
+ * end of spa_raidz_expand_thread().
+ */
+ VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
+ raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
+
+ vre->vre_end_time = gethrestime_sec();
+ vre->vre_state = DSS_FINISHED;
+
+ uint64_t state = vre->vre_state;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
+ sizeof (state), 1, &state, tx));
+
+ uint64_t end_time = vre->vre_end_time;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
+ sizeof (end_time), 1, &end_time, tx));
+
+ spa->spa_uberblock.ub_raidz_reflow_info = 0;
+
+ spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
+ "%s vdev %llu new width %llu", spa_name(spa),
+ (unsigned long long)vd->vdev_id,
+ (unsigned long long)vd->vdev_children);
+
+ spa->spa_raidz_expand = NULL;
+ raidvd->vdev_rz_expanding = B_FALSE;
+
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+ spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+ spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
+
+ spa_notify_waiters(spa);
+
+ /*
+	 * While we're in syncing context, take the opportunity to
+	 * set up a scrub. All the data has been successfully copied
+ * but we have not validated any checksums.
+ */
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
+ dsl_scan_setup_sync(&func, tx);
+}
+
+/*
+ * Struct for one copy zio.
+ */
+typedef struct raidz_reflow_arg {
+ vdev_raidz_expand_t *rra_vre;
+ zfs_locked_range_t *rra_lr;
+ uint64_t rra_txg;
+} raidz_reflow_arg_t;
+
+/*
+ * The write of the new location is done.
+ */
+static void
+raidz_reflow_write_done(zio_t *zio)
+{
+ raidz_reflow_arg_t *rra = zio->io_private;
+ vdev_raidz_expand_t *vre = rra->rra_vre;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&vre->vre_lock);
+ if (zio->io_error != 0) {
+ /* Force a reflow pause on errors */
+ vre->vre_failed_offset =
+ MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
+ }
+ ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
+ vre->vre_outstanding_bytes -= zio->io_size;
+ if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
+ vre->vre_failed_offset) {
+ vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
+ zio->io_size;
+ }
+ cv_signal(&vre->vre_cv);
+ mutex_exit(&vre->vre_lock);
+
+ zfs_rangelock_exit(rra->rra_lr);
+
+ kmem_free(rra, sizeof (*rra));
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
+static void
+raidz_reflow_read_done(zio_t *zio)
+{
+ raidz_reflow_arg_t *rra = zio->io_private;
+ vdev_raidz_expand_t *vre = rra->rra_vre;
+
+ /*
+ * If the read failed, or if it was done on a vdev that is not fully
+ * healthy (e.g. a child that has a resilver in progress), we may not
+ * have the correct data. Note that it's OK if the write proceeds.
+ * It may write garbage but the location is otherwise unused and we
+ * will retry later due to vre_failed_offset.
+ */
+ if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
+ zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
+ "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
+ (long long)rra->rra_lr->lr_offset,
+ (long long)rra->rra_lr->lr_length,
+ (long long)rra->rra_txg,
+ zio->io_error,
+ vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
+ vdev_dtl_empty(zio->io_vd, DTL_MISSING));
+ mutex_enter(&vre->vre_lock);
+ /* Force a reflow pause on errors */
+ vre->vre_failed_offset =
+ MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
+ mutex_exit(&vre->vre_lock);
+ }
+
+ zio_nowait(zio_unique_parent(zio));
+}
+
+static void
+raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
+ dmu_tx_t *tx)
+{
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (offset == 0)
+ return;
+
+ mutex_enter(&vre->vre_lock);
+ ASSERT3U(vre->vre_offset, <=, offset);
+ vre->vre_offset = offset;
+ mutex_exit(&vre->vre_lock);
+
+ if (vre->vre_offset_pertxg[txgoff] == 0) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
+ spa, tx);
+ }
+ vre->vre_offset_pertxg[txgoff] = offset;
+}
+
+static boolean_t
+vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
+{
+ for (int i = 0; i < raidz_vd->vdev_children; i++) {
+ /* Quick check if a child is being replaced */
+ if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ int ashift = vd->vdev_top->vdev_ashift;
+ uint64_t offset, size;
+
+ if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
+ &offset, &size)) {
+ return (B_FALSE);
+ }
+ ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
+ ASSERT3U(size, >=, 1 << ashift);
+ uint64_t length = 1 << ashift;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ uint64_t blkid = offset >> ashift;
+
+ int old_children = vd->vdev_children - 1;
+
+ /*
+ * We can only progress to the point that writes will not overlap
+ * with blocks whose progress has not yet been recorded on disk.
+ * Since partially-copied rows are still read from the old location,
+ * we need to stop one row before the sector-wise overlap, to prevent
+ * row-wise overlap.
+ *
+ * Note that even if we are skipping over a large unallocated region,
+ * we can't move the on-disk progress to `offset`, because concurrent
+ * writes/allocations could still use the currently-unallocated
+ * region.
+ */
+ uint64_t ubsync_blkid =
+ RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
+ uint64_t next_overwrite_blkid = ubsync_blkid +
+ ubsync_blkid / old_children - old_children;
+ VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
+
+ if (blkid >= next_overwrite_blkid) {
+ raidz_reflow_record_progress(vre,
+ next_overwrite_blkid << ashift, tx);
+ return (B_TRUE);
+ }
+
+ range_tree_remove(rt, offset, length);
+
+ raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
+ rra->rra_vre = vre;
+ rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
+ offset, length, RL_WRITER);
+ rra->rra_txg = dmu_tx_get_txg(tx);
+
+ raidz_reflow_record_progress(vre, offset + length, tx);
+
+ mutex_enter(&vre->vre_lock);
+ vre->vre_outstanding_bytes += length;
+ mutex_exit(&vre->vre_lock);
+
+ /*
+ * SCL_STATE will be released when the read and write are done,
+ * by raidz_reflow_write_done().
+ */
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+ /* check if a replacing vdev was added, if so treat it as an error */
+ if (vdev_raidz_expand_child_replacing(vd)) {
+ zfs_dbgmsg("replacing vdev encountered, reflow paused at "
+ "offset=%llu txg=%llu",
+ (long long)rra->rra_lr->lr_offset,
+ (long long)rra->rra_txg);
+
+ mutex_enter(&vre->vre_lock);
+ vre->vre_failed_offset =
+ MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
+ cv_signal(&vre->vre_cv);
+ mutex_exit(&vre->vre_lock);
+
+ /* drop everything we acquired */
+ zfs_rangelock_exit(rra->rra_lr);
+ kmem_free(rra, sizeof (*rra));
+ spa_config_exit(spa, SCL_STATE, spa);
+ return (B_TRUE);
+ }
+
+ zio_t *pio = spa->spa_txg_zio[txgoff];
+ abd_t *abd = abd_alloc_for_io(length, B_FALSE);
+ zio_t *write_zio = zio_vdev_child_io(pio, NULL,
+ vd->vdev_child[blkid % vd->vdev_children],
+ (blkid / vd->vdev_children) << ashift,
+ abd, length,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ raidz_reflow_write_done, rra);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ vd->vdev_child[blkid % old_children],
+ (blkid / old_children) << ashift,
+ abd, length,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ raidz_reflow_read_done, rra));
+
+ return (B_FALSE);
+}
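To make the overlap limit in raidz_reflow_impl() above concrete, the following standalone computation (not part of the patch; the ashift, width, and progress values are hypothetical) shows how far past the synced on-disk progress the copy may run before it must record new progress and wait for a txg to sync.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	int ashift = 9;				/* 512-byte sectors */
	uint64_t old_children = 4;		/* width before this expansion */
	uint64_t synced_offset = 1 << 20;	/* 1 MiB of on-disk progress */

	uint64_t ubsync_blkid = synced_offset >> ashift;
	uint64_t next_overwrite_blkid = ubsync_blkid +
	    ubsync_blkid / old_children - old_children;

	printf("ubsync_blkid=%llu next_overwrite_blkid=%llu "
	    "(headroom=%llu sectors)\n",
	    (unsigned long long)ubsync_blkid,
	    (unsigned long long)next_overwrite_blkid,
	    (unsigned long long)(next_overwrite_blkid - ubsync_blkid));
	return (0);
}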
+
+/*
+ * For testing (ztest specific)
+ */
+static void
+raidz_expand_pause(uint_t pause_point)
+{
+ while (raidz_expand_pause_point != 0 &&
+ raidz_expand_pause_point <= pause_point)
+ delay(hz);
+}
+
+static void
+raidz_scratch_child_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ mutex_enter(&pio->io_lock);
+ pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * Reflow the beginning portion of the vdev into an intermediate scratch area
+ * in memory and on disk. This operation must be persisted on disk before we
+ * proceed to overwrite the beginning portion with the reflowed data.
+ *
+ * This multi-step task can fail to complete if disk errors are encountered;
+ * we can return here after a pause (waiting for the disk to become healthy).
+ */
+static void
+raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_raidz_expand_t *vre = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ zio_t *pio;
+ int error;
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ int ashift = raidvd->vdev_ashift;
+ uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
+ uint64_t);
+ uint64_t logical_size = write_size * raidvd->vdev_children;
+ uint64_t read_size =
+ P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
+ 1 << ashift);
+
+ /*
+ * The scratch space must be large enough to get us to the point
+ * that one row does not overlap itself when moved. This is checked
+ * by vdev_raidz_attach_check().
+ */
+ VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
+ VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
+ VERIFY3U(write_size, <=, read_size);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
+ 0, logical_size, RL_WRITER);
+
+ abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
+ KM_SLEEP);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ abds[i] = abd_alloc_linear(read_size, B_FALSE);
+ }
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
+
+ /*
+ * If we have already written the scratch area then we must read from
+ * there, since new writes were redirected there while we were paused
+ * or the original location may have been partially overwritten with
+ * reflowed data.
+ */
+ if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
+ VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
+ /*
+ * Read from scratch space.
+ */
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ /*
+ * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
+ * to the offset to calculate the physical offset to
+ * write to. Passing in a negative offset makes us
+ * access the scratch area.
+ */
+ zio_nowait(zio_vdev_child_io(pio, NULL,
+ raidvd->vdev_child[i],
+ VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
+ write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ zfs_dbgmsg("reflow: error %d reading scratch location",
+ error);
+ goto io_error_exit;
+ }
+ goto overwrite;
+ }
+
+ /*
+ * Read from original location.
+ */
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children - 1; i++) {
+ ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ 0, abds[i], read_size, ZIO_TYPE_READ,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ zfs_dbgmsg("reflow: error %d reading original location", error);
+io_error_exit:
+ for (int i = 0; i < raidvd->vdev_children; i++)
+ abd_free(abds[i]);
+ kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
+ zfs_rangelock_exit(lr);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ return;
+ }
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
+
+ /*
+ * Reflow in memory.
+ */
+ uint64_t logical_sectors = logical_size >> ashift;
+ for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
+ int oldchild = i % (raidvd->vdev_children - 1);
+ uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
+
+ int newchild = i % raidvd->vdev_children;
+ uint64_t newoff = (i / raidvd->vdev_children) << ashift;
+
+ /* a single sector should not be copying over itself */
+ ASSERT(!(newchild == oldchild && newoff == oldoff));
+
+ abd_copy_off(abds[newchild], abds[oldchild],
+ newoff, oldoff, 1 << ashift);
+ }
+
+ /*
+ * Verify that we filled in everything we intended to (write_size on
+ * each child).
+ */
+ VERIFY0(logical_sectors % raidvd->vdev_children);
+ VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
+ write_size);
+
+ /*
+ * Write to scratch location (boot area).
+ */
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ /*
+ * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
+ * the offset to calculate the physical offset to write to.
+ * Passing in a negative offset lets us access the boot area.
+ */
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
+ write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ zfs_dbgmsg("reflow: error %d writing scratch location", error);
+ goto io_error_exit;
+ }
+ pio = zio_root(spa, NULL, NULL, 0);
+ zio_flush(pio, raidvd);
+ zio_wait(pio);
+
+ zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
+ (long long)logical_size);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
+
+ /*
+ * Update uberblock to indicate that scratch space is valid. This is
+ * needed because after this point, the real location may be
+ * overwritten. If we crash, we need to get the data from the
+ * scratch space, rather than the real location.
+ *
+ * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
+ * will prefer this uberblock.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
+ spa->spa_ubsync.ub_timestamp++;
+ ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
+ &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, &spa->spa_ubsync);
+
+ zfs_dbgmsg("reflow: uberblock updated "
+ "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
+ (long long)spa->spa_ubsync.ub_txg,
+ (long long)logical_size,
+ (long long)spa->spa_ubsync.ub_timestamp);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
+
+ /*
+ * Overwrite with reflow'ed data.
+ */
+overwrite:
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ 0, abds[i], write_size, ZIO_TYPE_WRITE,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
+ raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ /*
+ * When we exit early here and drop the range lock, new
+ * writes will go into the scratch area so we'll need to
+ * read from there when we return after pausing.
+ */
+ zfs_dbgmsg("reflow: error %d writing real location", error);
+ /*
+ * Update the uberblock that is written when this txg completes.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
+ logical_size);
+ goto io_error_exit;
+ }
+ pio = zio_root(spa, NULL, NULL, 0);
+ zio_flush(pio, raidvd);
+ zio_wait(pio);
+
+ zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
+ (long long)logical_size);
+ for (int i = 0; i < raidvd->vdev_children; i++)
+ abd_free(abds[i]);
+ kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
+
+ /*
+ * Update uberblock to indicate that the initial part has been
+ * reflow'ed. This is needed because after this point (when we exit
+ * the rangelock), we allow regular writes to this region, which will
+ * be written to the new location only (because reflow_offset_next ==
+ * reflow_offset_synced). If we crashed and re-copied from the
+ * scratch space, we would lose the regular writes.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
+ logical_size);
+ spa->spa_ubsync.ub_timestamp++;
+ ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
+ &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, &spa->spa_ubsync);
+
+ zfs_dbgmsg("reflow: uberblock updated "
+ "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
+ (long long)spa->spa_ubsync.ub_txg,
+ (long long)logical_size,
+ (long long)spa->spa_ubsync.ub_timestamp);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
+
+ /*
+ * Update progress.
+ */
+ vre->vre_offset = logical_size;
+ zfs_rangelock_exit(lr);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
+ vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
+ /*
+ * Note - raidz_reflow_sync() will update the uberblock state to
+ * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
+ */
+ raidz_reflow_sync(spa, tx);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
+}
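The in-memory reflow step inside raidz_reflow_scratch_sync() above boils down to a simple index remapping; this standalone sketch (not part of the patch; the widths and sector count are hypothetical) prints where each logical sector of the scratch region moves. The first new_children - 1 sectors already sit in their final place, which is why the loop starts there.

#include <stdio.h>

int
main(void)
{
	int old_children = 4;		/* width before expansion */
	int new_children = 5;		/* width after expansion */
	int logical_sectors = 20;	/* small hypothetical scratch region */

	for (int i = new_children - 1; i < logical_sectors; i++) {
		int oldchild = i % old_children;
		int oldrow = i / old_children;
		int newchild = i % new_children;
		int newrow = i / new_children;

		printf("sector %2d: child %d row %d -> child %d row %d\n",
		    i, oldchild, oldrow, newchild, newrow);
	}
	return (0);
}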
+
+/*
+ * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
+ * here. No other i/o can be in progress, so we don't need the vre_rangelock.
+ */
+void
+vdev_raidz_reflow_copy_scratch(spa_t *spa)
+{
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
+ ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ ASSERT0(logical_size % raidvd->vdev_children);
+ uint64_t write_size = logical_size / raidvd->vdev_children;
+
+ zio_t *pio;
+
+ /*
+ * Read from scratch space.
+ */
+ abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
+ KM_SLEEP);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ abds[i] = abd_alloc_linear(write_size, B_FALSE);
+ }
+
+ pio = zio_root(spa, NULL, NULL, 0);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ /*
+ * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
+ * the offset to calculate the physical offset to write to.
+ * Passing in a negative offset lets us access the boot area.
+ */
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
+ write_size, ZIO_TYPE_READ,
+ ZIO_PRIORITY_ASYNC_READ, 0,
+ raidz_scratch_child_done, pio));
+ }
+ zio_wait(pio);
+
+ /*
+ * Overwrite real location with reflow'ed data.
+ */
+ pio = zio_root(spa, NULL, NULL, 0);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ 0, abds[i], write_size, ZIO_TYPE_WRITE,
+ ZIO_PRIORITY_ASYNC_WRITE, 0,
+ raidz_scratch_child_done, pio));
+ }
+ zio_wait(pio);
+ pio = zio_root(spa, NULL, NULL, 0);
+ zio_flush(pio, raidvd);
+ zio_wait(pio);
+
+ zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
+ "to real location", (long long)logical_size);
+
+ for (int i = 0; i < raidvd->vdev_children; i++)
+ abd_free(abds[i]);
+ kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
+
+ /*
+ * Update uberblock.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_ubsync,
+ RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
+ spa->spa_ubsync.ub_timestamp++;
+ VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
+ &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, &spa->spa_ubsync);
+
+ zfs_dbgmsg("reflow recovery: uberblock updated "
+ "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
+ (long long)spa->spa_ubsync.ub_txg,
+ (long long)logical_size,
+ (long long)spa->spa_ubsync.ub_timestamp);
+
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
+ spa_first_txg(spa));
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ vre->vre_offset = logical_size;
+ vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
+ vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
+ /*
+ * Note that raidz_reflow_sync() will update the uberblock once more
+ */
+ raidz_reflow_sync(spa, tx);
+
+ dmu_tx_commit(tx);
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+}
+
+static boolean_t
+spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
+{
+ (void) zthr;
+ spa_t *spa = arg;
+
+ return (spa->spa_raidz_expand != NULL &&
+ !spa->spa_raidz_expand->vre_waiting_for_resilver);
+}
+
+/*
+ * RAIDZ expansion background thread
+ *
+ * Can be called multiple times if the reflow is paused
+ */
+static void
+spa_raidz_expand_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+
+ if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
+ vre->vre_offset = 0;
+ else
+ vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
+
+	/* Reflow the beginning portion using the scratch area */
+ if (vre->vre_offset == 0) {
+ VERIFY0(dsl_sync_task(spa_name(spa),
+ NULL, raidz_reflow_scratch_sync,
+ vre, 0, ZFS_SPACE_CHECK_NONE));
+
+ /* if we encountered errors then pause */
+ if (vre->vre_offset == 0) {
+ mutex_enter(&vre->vre_lock);
+ vre->vre_waiting_for_resilver = B_TRUE;
+ mutex_exit(&vre->vre_lock);
+ return;
+ }
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+
+ uint64_t guid = raidvd->vdev_guid;
+
+ /* Iterate over all the remaining metaslabs */
+ for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
+ i < raidvd->vdev_ms_count &&
+ !zthr_iscancelled(zthr) &&
+ vre->vre_failed_offset == UINT64_MAX; i++) {
+ metaslab_t *msp = raidvd->vdev_ms[i];
+
+ metaslab_disable(msp);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * The metaslab may be newly created (for the expanded
+ * space), in which case its trees won't exist yet,
+ * so we need to bail out early.
+ */
+ if (msp->ms_new) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ continue;
+ }
+
+ VERIFY0(metaslab_load(msp));
+
+ /*
+ * We want to copy everything except the free (allocatable)
+ * space. Note that there may be a little bit more free
+ * space (e.g. in ms_defer), and it's fine to copy that too.
+ */
+ range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
+ NULL, 0, 0);
+ range_tree_add(rt, msp->ms_start, msp->ms_size);
+ range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
+ mutex_exit(&msp->ms_lock);
+
+ /*
+ * Force the last sector of each metaslab to be copied. This
+ * ensures that we advance the on-disk progress to the end of
+ * this metaslab while the metaslab is disabled. Otherwise, we
+ * could move past this metaslab without advancing the on-disk
+ * progress, and then an allocation to this metaslab would not
+ * be copied.
+ */
+ int sectorsz = 1 << raidvd->vdev_ashift;
+ uint64_t ms_last_offset = msp->ms_start +
+ msp->ms_size - sectorsz;
+ if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
+ range_tree_add(rt, ms_last_offset, sectorsz);
+ }
+
+ /*
+ * When we are resuming from a paused expansion (i.e.
+		 * when importing a pool with an expansion in progress),
+ * discard any state that we have already processed.
+ */
+ range_tree_clear(rt, 0, vre->vre_offset);
+
+ while (!zthr_iscancelled(zthr) &&
+ !range_tree_is_empty(rt) &&
+ vre->vre_failed_offset == UINT64_MAX) {
+
+ /*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * If requested, pause the reflow when the amount
+ * specified by raidz_expand_max_reflow_bytes is reached
+ *
+ * This pause is only used during testing or debugging.
+ */
+ while (raidz_expand_max_reflow_bytes != 0 &&
+ raidz_expand_max_reflow_bytes <=
+ vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
+ delay(hz);
+ }
+
+ mutex_enter(&vre->vre_lock);
+ while (vre->vre_outstanding_bytes >
+ raidz_expand_max_copy_bytes) {
+ cv_wait(&vre->vre_cv, &vre->vre_lock);
+ }
+ mutex_exit(&vre->vre_lock);
+
+ dmu_tx_t *tx =
+ dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ /*
+ * Reacquire the vdev_config lock. Theoretically, the
+ * vdev_t that we're expanding may have changed.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+
+ boolean_t needsync =
+ raidz_reflow_impl(raidvd, vre, rt, tx);
+
+ dmu_tx_commit(tx);
+
+ if (needsync) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+ spa_config_enter(spa, SCL_CONFIG, FTAG,
+ RW_READER);
+ }
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ range_tree_vacate(rt, NULL, NULL);
+ range_tree_destroy(rt);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * The txg_wait_synced() here ensures that all reflow zio's have
+ * completed, and vre_failed_offset has been set if necessary. It
+ * also ensures that the progress of the last raidz_reflow_sync() is
+ * written to disk before raidz_reflow_complete_sync() changes the
+ * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
+ * determine if a reflow is in progress, in which case we may need to
+ * write to both old and new locations. Therefore we can only change
+ * vre_state once this is not necessary, which is once the on-disk
+ * progress (in spa_ubsync) has been set past any possible writes (to
+ * the end of the last metaslab).
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ if (!zthr_iscancelled(zthr) &&
+ vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
+ /*
+ * We are not being canceled or paused, so the reflow must be
+ * complete. In that case also mark it as completed on disk.
+ */
+ ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ raidz_reflow_complete_sync, spa,
+ 0, ZFS_SPACE_CHECK_NONE));
+ (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
+ } else {
+ /*
+ * Wait for all copy zio's to complete and for all the
+ * raidz_reflow_sync() synctasks to be run.
+ */
+ spa_history_log_internal(spa, "reflow pause",
+ NULL, "offset=%llu failed_offset=%lld",
+ (long long)vre->vre_offset,
+ (long long)vre->vre_failed_offset);
+ mutex_enter(&vre->vre_lock);
+ if (vre->vre_failed_offset != UINT64_MAX) {
+ /*
+ * Reset progress so that we will retry everything
+ * after the point that something failed.
+ */
+ vre->vre_offset = vre->vre_failed_offset;
+ vre->vre_failed_offset = UINT64_MAX;
+ vre->vre_waiting_for_resilver = B_TRUE;
+ }
+ mutex_exit(&vre->vre_lock);
+ }
+}
+
+void
+spa_start_raidz_expansion_thread(spa_t *spa)
+{
+ ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
+ spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
+ spa_raidz_expand_thread_check, spa_raidz_expand_thread,
+ spa, defclsyspri);
+}
+
+void
+raidz_dtl_reassessed(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ if (spa->spa_raidz_expand != NULL) {
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ /*
+		 * We get called often from vdev_dtl_reassess(), so make
+		 * sure it's our vdev and that any replacing is complete.
+ */
+ if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
+ !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
+ mutex_enter(&vre->vre_lock);
+ if (vre->vre_waiting_for_resilver) {
+ vdev_dbgmsg(vd, "DTL reassessed, "
+ "continuing raidz expansion");
+ vre->vre_waiting_for_resilver = B_FALSE;
+ zthr_wakeup(spa->spa_raidz_expand_zthr);
+ }
+ mutex_exit(&vre->vre_lock);
+ }
+ }
+}
+
+int
+vdev_raidz_attach_check(vdev_t *new_child)
+{
+ vdev_t *raidvd = new_child->vdev_parent;
+ uint64_t new_children = raidvd->vdev_children;
+
+ /*
+ * We use the "boot" space as scratch space to handle overwriting the
+ * initial part of the vdev. If it is too small, then this expansion
+ * is not allowed. This would be very unusual (e.g. ashift > 13 and
+ * >200 children).
+ */
+ if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
+ return (EINVAL);
+ }
+ return (0);
+}
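A rough standalone sketch (not part of the patch) of the limit enforced by vdev_raidz_attach_check() above; the 3.5 MiB boot-area size below is an assumption standing in for VDEV_BOOT_SIZE, so the printed child counts are only illustrative.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t boot_size = 7ULL << 19;	/* assumed 3.5 MiB boot area */

	for (int ashift = 9; ashift <= 16; ashift++) {
		printf("ashift=%2d: expansion allowed up to %llu children\n",
		    ashift, (unsigned long long)(boot_size >> ashift));
	}
	return (0);
}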
+
+void
+vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *new_child = arg;
+ spa_t *spa = new_child->vdev_spa;
+ vdev_t *raidvd = new_child->vdev_parent;
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+ ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
+ ASSERT3P(raidvd->vdev_top, ==, raidvd);
+ ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
+ ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
+ ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
+ new_child);
+
+ spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
+
+ vdrz->vd_physical_width++;
+
+ VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
+ vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
+ vdrz->vn_vre.vre_offset = 0;
+ vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
+ spa->spa_raidz_expand = &vdrz->vn_vre;
+ zthr_wakeup(spa->spa_raidz_expand_zthr);
+
+ /*
+ * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
+ * written to the config.
+ */
+ vdev_config_dirty(raidvd);
+
+ vdrz->vn_vre.vre_start_time = gethrestime_sec();
+ vdrz->vn_vre.vre_end_time = 0;
+ vdrz->vn_vre.vre_state = DSS_SCANNING;
+ vdrz->vn_vre.vre_bytes_copied = 0;
+
+ uint64_t state = vdrz->vn_vre.vre_state;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
+ sizeof (state), 1, &state, tx));
+
+ uint64_t start_time = vdrz->vn_vre.vre_start_time;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
+ sizeof (start_time), 1, &start_time, tx));
+
+ (void) zap_remove(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
+ (void) zap_remove(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
+
+ spa_history_log_internal(spa, "raidz vdev expansion started", tx,
+ "%s vdev %llu new width %llu", spa_name(spa),
+ (unsigned long long)raidvd->vdev_id,
+ (unsigned long long)raidvd->vdev_children);
+}
+
+int
+vdev_raidz_load(vdev_t *vd)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ int err;
+
+ uint64_t state = DSS_NONE;
+ uint64_t start_time = 0;
+ uint64_t end_time = 0;
+ uint64_t bytes_copied = 0;
+
+ if (vd->vdev_top_zap != 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
+ sizeof (state), 1, &state);
+ if (err != 0 && err != ENOENT)
+ return (err);
+
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
+ sizeof (start_time), 1, &start_time);
+ if (err != 0 && err != ENOENT)
+ return (err);
+
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
+ sizeof (end_time), 1, &end_time);
+ if (err != 0 && err != ENOENT)
+ return (err);
+
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
+ sizeof (bytes_copied), 1, &bytes_copied);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ /*
+ * If we are in the middle of expansion, vre_state should have
+ * already been set by vdev_raidz_init().
+ */
+ EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
+ vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
+ vdrz->vn_vre.vre_start_time = start_time;
+ vdrz->vn_vre.vre_end_time = end_time;
+ vdrz->vn_vre.vre_bytes_copied = bytes_copied;
+
+ return (0);
+}
+
+int
+spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
+{
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+
+ if (vre == NULL) {
+		/* no expansion in progress; find most recent completed */
+ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ if (vdrz->vn_vre.vre_end_time != 0 &&
+ (vre == NULL ||
+ vdrz->vn_vre.vre_end_time >
+ vre->vre_end_time)) {
+ vre = &vdrz->vn_vre;
+ }
+ }
+ }
+ }
+
+ if (vre == NULL) {
+ return (SET_ERROR(ENOENT));
+ }
+
+ pres->pres_state = vre->vre_state;
+ pres->pres_expanding_vdev = vre->vre_vdev_id;
+
+ vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
+
+ mutex_enter(&vre->vre_lock);
+ pres->pres_reflowed = vre->vre_bytes_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
+ mutex_exit(&vre->vre_lock);
+
+ pres->pres_start_time = vre->vre_start_time;
+ pres->pres_end_time = vre->vre_end_time;
+ pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
+
+ return (0);
+}
+
/*
* Initialize private RAIDZ specific fields from the nvlist.
*/
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
- vdev_raidz_t *vdrz;
- uint64_t nparity;
-
uint_t children;
nvlist_t **child;
int error = nvlist_lookup_nvlist_array(nv,
@@ -2541,6 +4820,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
if (error != 0)
return (SET_ERROR(EINVAL));
+ uint64_t nparity;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (SET_ERROR(EINVAL));
@@ -2567,10 +4847,56 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
nparity = 1;
}
- vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
- vdrz->vd_logical_width = children;
+ vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+ vdrz->vn_vre.vre_vdev_id = -1;
+ vdrz->vn_vre.vre_offset = UINT64_MAX;
+ vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
+ mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
+ zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
+ mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
+ sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
+
+ vdrz->vd_physical_width = children;
vdrz->vd_nparity = nparity;
+ /* note, the ID does not exist when creating a pool */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+ &vdrz->vn_vre.vre_vdev_id);
+
+ boolean_t reflow_in_progress =
+ nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
+ if (reflow_in_progress) {
+ spa->spa_raidz_expand = &vdrz->vn_vre;
+ vdrz->vn_vre.vre_state = DSS_SCANNING;
+ }
+
+ vdrz->vd_original_width = children;
+ uint64_t *txgs;
+ unsigned int txgs_size = 0;
+ error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
+ &txgs, &txgs_size);
+ if (error == 0) {
+ for (int i = 0; i < txgs_size; i++) {
+ reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
+ re->re_txg = txgs[txgs_size - i - 1];
+ re->re_logical_width = vdrz->vd_physical_width - i;
+
+ if (reflow_in_progress)
+ re->re_logical_width--;
+
+ avl_add(&vdrz->vd_expand_txgs, re);
+ }
+
+ vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
+ }
+ if (reflow_in_progress) {
+ vdrz->vd_original_width--;
+ zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
+ children, txgs_size);
+ }
+
*tsd = vdrz;
return (0);
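As a standalone illustration (not part of the patch; the txg values and widths are made up), this mirrors the loop above that decodes ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, with the most recent expansion's txg last as the loop's indexing implies, into the logical width in effect from each expansion's completion txg onward.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t txgs[] = { 100, 500, 900 };	/* hypothetical completion txgs */
	unsigned int txgs_size = 3;
	unsigned int physical_width = 8;	/* current number of children */
	int reflow_in_progress = 0;		/* ZPOOL_CONFIG_RAIDZ_EXPANDING */

	for (unsigned int i = 0; i < txgs_size; i++) {
		uint64_t txg = txgs[txgs_size - i - 1];
		unsigned int width = physical_width - i -
		    (reflow_in_progress ? 1 : 0);

		printf("from txg %llu on, new blocks use logical width %u\n",
		    (unsigned long long)txg, width);
	}
	printf("original width: %u\n", physical_width - txgs_size -
	    (reflow_in_progress ? 1 : 0));
	return (0);
}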
@@ -2579,7 +4905,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
static void
vdev_raidz_fini(vdev_t *vd)
{
- kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
+ vd->vdev_spa->spa_raidz_expand = NULL;
+ reflow_node_t *re;
+ void *cookie = NULL;
+ avl_tree_t *tree = &vdrz->vd_expand_txgs;
+ while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
+ kmem_free(re, sizeof (*re));
+ avl_destroy(&vdrz->vd_expand_txgs);
+ mutex_destroy(&vdrz->vd_expand_lock);
+ mutex_destroy(&vdrz->vn_vre.vre_lock);
+ cv_destroy(&vdrz->vn_vre.vre_cv);
+ zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
+ kmem_free(vdrz, sizeof (*vdrz));
}
/*
@@ -2607,6 +4946,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
* it.
*/
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
+
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
+ fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
+ }
+
+ mutex_enter(&vdrz->vd_expand_lock);
+ if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
+ uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
+ uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
+ KM_SLEEP);
+ uint64_t i = 0;
+
+ for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
+ re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
+ txgs[i++] = re->re_txg;
+ }
+
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
+ txgs, count);
+
+ kmem_free(txgs, sizeof (uint64_t) * count);
+ }
+ mutex_exit(&vdrz->vd_expand_lock);
}
static uint64_t
@@ -2646,3 +5008,15 @@ vdev_ops_t vdev_raidz_ops = {
.vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
+ "For testing, pause RAIDZ expansion after reflowing this many bytes");
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
+ "Max amount of concurrent i/o for RAIDZ expansion");
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
+ "For expanded RAIDZ, aggregate reads that have more rows than this");
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
+ "For expanded RAIDZ, automatically start a pool scrub when expansion "
+ "completes");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
index 03df2df5adaf..e12b96170f55 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -22,6 +22,7 @@
* Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/
+#include <sys/simd.h>
#include <sys/zfs_context.h>
#include <sys/types.h>
#include <sys/zio.h>
@@ -29,7 +30,6 @@
#include <sys/zfs_debug.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
-#include <sys/simd.h>
/* Opaque implementation with NULL methods to represent original methods */
static const raidz_impl_ops_t vdev_raidz_original_impl = {
@@ -43,7 +43,7 @@ static raidz_impl_ops_t vdev_raidz_fastest_impl = {
};
/* All compiled in implementations */
-const raidz_impl_ops_t *raidz_all_maths[] = {
+static const raidz_impl_ops_t *const raidz_all_maths[] = {
&vdev_raidz_original_impl,
&vdev_raidz_scalar_impl,
#if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
@@ -268,10 +268,10 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
return (rec_fn(rr, dt));
}
-const char *raidz_gen_name[] = {
+const char *const raidz_gen_name[] = {
"gen_p", "gen_pq", "gen_pqr"
};
-const char *raidz_rec_name[] = {
+const char *const raidz_rec_name[] = {
"rec_p", "rec_q", "rec_r",
"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
@@ -283,22 +283,19 @@ const char *raidz_rec_name[] = {
static int
raidz_math_kstat_headers(char *buf, size_t size)
{
- int i;
- ssize_t off;
-
ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
- off = snprintf(buf, size, "%-17s", "implementation");
+ ssize_t off = kmem_scnprintf(buf, size, "%-17s", "implementation");
- for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
- off += snprintf(buf + off, size - off, "%-16s",
+ for (int i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
+ off += kmem_scnprintf(buf + off, size - off, "%-16s",
raidz_gen_name[i]);
- for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
- off += snprintf(buf + off, size - off, "%-16s",
+ for (int i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
+ off += kmem_scnprintf(buf + off, size - off, "%-16s",
raidz_rec_name[i]);
- (void) snprintf(buf + off, size - off, "\n");
+ (void) kmem_scnprintf(buf + off, size - off, "\n");
return (0);
}
@@ -314,34 +311,35 @@ raidz_math_kstat_data(char *buf, size_t size, void *data)
ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
if (cstat == fstat) {
- off += snprintf(buf + off, size - off, "%-17s", "fastest");
+ off += kmem_scnprintf(buf + off, size - off, "%-17s",
+ "fastest");
for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) {
int id = fstat->gen[i];
- off += snprintf(buf + off, size - off, "%-16s",
+ off += kmem_scnprintf(buf + off, size - off, "%-16s",
raidz_supp_impl[id]->name);
}
for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) {
int id = fstat->rec[i];
- off += snprintf(buf + off, size - off, "%-16s",
+ off += kmem_scnprintf(buf + off, size - off, "%-16s",
raidz_supp_impl[id]->name);
}
} else {
ptrdiff_t id = cstat - raidz_impl_kstats;
- off += snprintf(buf + off, size - off, "%-17s",
+ off += kmem_scnprintf(buf + off, size - off, "%-17s",
raidz_supp_impl[id]->name);
for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
- off += snprintf(buf + off, size - off, "%-16llu",
+ off += kmem_scnprintf(buf + off, size - off, "%-16llu",
(u_longlong_t)cstat->gen[i]);
for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
- off += snprintf(buf + off, size - off, "%-16llu",
+ off += kmem_scnprintf(buf + off, size - off, "%-16llu",
(u_longlong_t)cstat->rec[i]);
}
- (void) snprintf(buf + off, size - off, "\n");
+ (void) kmem_scnprintf(buf + off, size - off, "\n");
return (0);
}
@@ -566,7 +564,7 @@ vdev_raidz_math_fini(void)
}
static const struct {
- char *name;
+ const char *name;
uint32_t sel;
} math_impl_opts[] = {
{ "cycle", IMPL_CYCLE },
@@ -655,13 +653,15 @@ zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
/* list mandatory options */
for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
- cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
+ cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
+ math_impl_opts[i].name);
}
/* list all supported implementations */
for (i = 0; i < raidz_supp_impl_cnt; i++) {
fmt = (i == impl) ? "[%s] " : "%s ";
- cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
+ cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
+ raidz_supp_impl[i]->name);
}
return (cnt);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c
index 0a67ceb84920..4aa7bc2b9708 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
index e46b2536546c..f0f6546f7f71 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c
index e072f51cd635..bd9de91a4ba8 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -210,9 +210,13 @@ DEFINE_GEN_METHODS(aarch64_neonx2);
* If compiled with -O0, gcc doesn't do any stack frame coalescing
* and -Wframe-larger-than=1024 is triggered in debug mode.
*/
+#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
DEFINE_REC_METHODS(aarch64_neonx2);
+#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
+#endif
static boolean_t
raidz_will_aarch64_neonx2_work(void)
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c
index 65e4bebce8fa..e5bbc7decbfa 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c
index f06b469023eb..3b709ed34fc4 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c
index aab653b77491..5ec71a04133a 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
index 35e016fc65a5..5d77c5d046d5 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -214,9 +214,10 @@ raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private)
}
-#define raidz_copy(dabd, sabd, size) \
+#define raidz_copy(dabd, sabd, off, size) \
{ \
- abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\
+ abd_iterate_func2(dabd, sabd, off, off, size, raidz_copy_abd_cb, \
+ NULL); \
}
/*
@@ -254,9 +255,10 @@ raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private)
return (0);
}
-#define raidz_add(dabd, sabd, size) \
+#define raidz_add(dabd, sabd, off, size) \
{ \
- abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\
+ abd_iterate_func2(dabd, sabd, off, off, size, raidz_add_abd_cb, \
+ NULL); \
}
/*
@@ -343,7 +345,10 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private)
* the parity/syndrome if data column is shorter.
*
* P parity is calculated using raidz_add_abd().
+ *
+ * For CPU L2 cache blocking we process 64KB at a time.
*/
+#define CHUNK 65536
/*
* Generate P parity (RAIDZ1)
@@ -357,20 +362,26 @@ raidz_generate_p_impl(raidz_row_t * const rr)
const size_t ncols = rr->rr_cols;
const size_t psize = rr->rr_col[CODE_P].rc_size;
abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
- size_t size;
- abd_t *dabd;
+ size_t off, size;
raidz_math_begin();
- /* start with first data column */
- raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
+ for (off = 0; off < psize; off += CHUNK) {
+
+ /* start with first data column */
+ size = MIN(CHUNK, psize - off);
+ raidz_copy(pabd, rr->rr_col[1].rc_abd, off, size);
- for (c = 2; c < ncols; c++) {
- dabd = rr->rr_col[c].rc_abd;
- size = rr->rr_col[c].rc_size;
+ for (c = 2; c < ncols; c++) {
+ size = rr->rr_col[c].rc_size;
+ if (size <= off)
+ continue;
- /* add data column */
- raidz_add(pabd, dabd, size);
+ /* add data column */
+ size = MIN(CHUNK, size - off);
+ abd_t *dabd = rr->rr_col[c].rc_abd;
+ raidz_add(pabd, dabd, off, size);
+ }
}
raidz_math_end();
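The chunked loop above changes the traversal order but not the math; this standalone sketch (not part of the patch; column sizes are hypothetical) prints the copy/add operations that the 64 KB cache-blocking pass performs, including the skip of data columns that end before the current offset.

#include <stdio.h>
#include <stddef.h>

#define	CHUNK		65536
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	size_t psize = 3 * CHUNK;	/* parity (and first data column) size */
	size_t col_size[] = { 3 * CHUNK, 3 * CHUNK, CHUNK };	/* data columns */
	int ncols = 3;

	for (size_t off = 0; off < psize; off += CHUNK) {
		size_t size = MIN(CHUNK, psize - off);

		printf("off=%zu: copy %zu bytes from data column 0\n",
		    off, size);
		for (int c = 1; c < ncols; c++) {
			if (col_size[c] <= off)
				continue;	/* column ends before chunk */
			size = MIN(CHUNK, col_size[c] - off);
			printf("off=%zu: add %zu bytes from data column %d\n",
			    off, size, c);
		}
	}
	return (0);
}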
@@ -423,7 +434,7 @@ raidz_generate_pq_impl(raidz_row_t * const rr)
size_t c;
const size_t ncols = rr->rr_cols;
const size_t csize = rr->rr_col[CODE_P].rc_size;
- size_t dsize;
+ size_t off, size, dsize;
abd_t *dabd;
abd_t *cabds[] = {
rr->rr_col[CODE_P].rc_abd,
@@ -432,15 +443,20 @@ raidz_generate_pq_impl(raidz_row_t * const rr)
raidz_math_begin();
- raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
- raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
+ for (off = 0; off < csize; off += CHUNK) {
- for (c = 3; c < ncols; c++) {
- dabd = rr->rr_col[c].rc_abd;
- dsize = rr->rr_col[c].rc_size;
+ size = MIN(CHUNK, csize - off);
+ raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, off, size);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, off, size);
- abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
- raidz_gen_pq_add);
+ for (c = 3; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ dsize = (dsize > off) ? MIN(CHUNK, dsize - off) : 0;
+
+ abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 2,
+ raidz_gen_pq_add);
+ }
}
raidz_math_end();
@@ -460,8 +476,8 @@ static void
raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
const size_t dsize)
{
- v_t *p = (v_t *)c[0];
- v_t *q = (v_t *)c[1];
+ v_t *p = (v_t *)c[CODE_P];
+ v_t *q = (v_t *)c[CODE_Q];
v_t *r = (v_t *)c[CODE_R];
const v_t *d = (const v_t *)dc;
const v_t * const dend = d + (dsize / sizeof (v_t));
@@ -486,7 +502,7 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
/*
- * Generate PQR parity (RAIDZ2)
+ * Generate PQR parity (RAIDZ3)
*
* @rr RAIDZ row
*/
@@ -496,7 +512,7 @@ raidz_generate_pqr_impl(raidz_row_t * const rr)
size_t c;
const size_t ncols = rr->rr_cols;
const size_t csize = rr->rr_col[CODE_P].rc_size;
- size_t dsize;
+ size_t off, size, dsize;
abd_t *dabd;
abd_t *cabds[] = {
rr->rr_col[CODE_P].rc_abd,
@@ -506,16 +522,21 @@ raidz_generate_pqr_impl(raidz_row_t * const rr)
raidz_math_begin();
- raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
- raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
- raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
+ for (off = 0; off < csize; off += CHUNK) {
- for (c = 4; c < ncols; c++) {
- dabd = rr->rr_col[c].rc_abd;
- dsize = rr->rr_col[c].rc_size;
+ size = MIN(CHUNK, csize - off);
+ raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, off, size);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, off, size);
+ raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, off, size);
- abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
- raidz_gen_pqr_add);
+ for (c = 4; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ dsize = (dsize > off) ? MIN(CHUNK, dsize - off) : 0;
+
+ abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 3,
+ raidz_gen_pqr_add);
+ }
}
raidz_math_end();
@@ -592,26 +613,31 @@ raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
const size_t x = tgtidx[TARGET_X];
const size_t xsize = rr->rr_col[x].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
- size_t size;
- abd_t *dabd;
+ size_t off, size;
if (xabd == NULL)
return (1 << CODE_P);
raidz_math_begin();
- /* copy P into target */
- raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
+ for (off = 0; off < xsize; off += CHUNK) {
- /* generate p_syndrome */
- for (c = firstdc; c < ncols; c++) {
- if (c == x)
- continue;
+ /* copy P into target */
+ size = MIN(CHUNK, xsize - off);
+ raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, off, size);
- dabd = rr->rr_col[c].rc_abd;
- size = MIN(rr->rr_col[c].rc_size, xsize);
+ /* generate p_syndrome */
+ for (c = firstdc; c < ncols; c++) {
+ if (c == x)
+ continue;
+ size = rr->rr_col[c].rc_size;
+ if (size <= off)
+ continue;
- raidz_add(xabd, dabd, size);
+ size = MIN(CHUNK, MIN(size, xsize) - off);
+ abd_t *dabd = rr->rr_col[c].rc_abd;
+ raidz_add(xabd, dabd, off, size);
+ }
}
raidz_math_end();
@@ -683,7 +709,7 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
} else {
raidz_zero(xabd, xsize);
}
@@ -698,12 +724,12 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
dsize = rr->rr_col[c].rc_size;
}
- abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1,
raidz_syn_q_abd);
}
/* add Q to the syndrome */
- raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
+ raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, 0, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
@@ -777,7 +803,7 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
} else {
raidz_zero(xabd, xsize);
}
@@ -793,12 +819,12 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
dsize = rr->rr_col[c].rc_size;
}
- abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1,
raidz_syn_r_abd);
}
/* add R to the syndrome */
- raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
+ raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, 0, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
@@ -934,8 +960,8 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -951,7 +977,7 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
dsize = rr->rr_col[c].rc_size;
}
- abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2,
raidz_syn_pq_abd);
}
@@ -959,7 +985,7 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
/* Copy shorter targets back to the original abd buffer */
if (ysize < xsize)
- raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
raidz_math_end();
@@ -1094,8 +1120,8 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1111,7 +1137,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
dsize = rr->rr_col[c].rc_size;
}
- abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2,
raidz_syn_pr_abd);
}
@@ -1121,7 +1147,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
- raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
raidz_math_end();
@@ -1261,8 +1287,8 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1278,7 +1304,7 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
dsize = rr->rr_col[c].rc_size;
}
- abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2,
raidz_syn_qr_abd);
}
@@ -1288,7 +1314,7 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
- raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
raidz_math_end();
@@ -1456,9 +1482,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
- raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
+ raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1475,7 +1501,7 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
dsize = rr->rr_col[c].rc_size;
}
- abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
+ abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 3,
raidz_syn_pqr_abd);
}
@@ -1485,9 +1511,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
- raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
if (zsize < xsize)
- raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
+ raidz_copy(rr->rr_col[z].rc_abd, zabd, 0, zsize);
raidz_math_end();
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c
index 1db2c4cd3a47..ff493b8b7bc0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
index 3842f5fd637c..f76eb47a9c66 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -26,10 +26,6 @@
#include <sys/types.h>
#include <sys/simd.h>
-#ifdef __linux__
-#define __asm __asm__ __volatile__
-#endif
-
#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
@@ -142,7 +138,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 8: \
- __asm( \
+ __asm__ __volatile__( \
"lvx 21,0,%[SRC0]\n" \
"lvx 20,0,%[SRC1]\n" \
"lvx 19,0,%[SRC2]\n" \
@@ -172,7 +168,7 @@ typedef struct v {
: "v18", "v19", "v20", "v21"); \
break; \
case 4: \
- __asm( \
+ __asm__ __volatile__( \
"lvx 21,0,%[SRC0]\n" \
"lvx 20,0,%[SRC1]\n" \
"lvx 19,0,%[SRC2]\n" \
@@ -189,7 +185,7 @@ typedef struct v {
: "v18", "v19", "v20", "v21"); \
break; \
case 2: \
- __asm( \
+ __asm__ __volatile__( \
"lvx 21,0,%[SRC0]\n" \
"lvx 20,0,%[SRC1]\n" \
"vxor " VR0(r) "," VR0(r) ",21\n" \
@@ -208,7 +204,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 8: \
- __asm( \
+ __asm__ __volatile__( \
"vxor " VR4(r) "," VR4(r) "," VR0(r) "\n" \
"vxor " VR5(r) "," VR5(r) "," VR1(r) "\n" \
"vxor " VR6(r) "," VR6(r) "," VR2(r) "\n" \
@@ -217,7 +213,7 @@ typedef struct v {
: RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
break; \
case 4: \
- __asm( \
+ __asm__ __volatile__( \
"vxor " VR2(r) "," VR2(r) "," VR0(r) "\n" \
"vxor " VR3(r) "," VR3(r) "," VR1(r) "\n" \
: UVR2(r), UVR3(r) \
@@ -232,7 +228,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 8: \
- __asm( \
+ __asm__ __volatile__( \
"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
"vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \
@@ -245,7 +241,7 @@ typedef struct v {
WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \
break; \
case 4: \
- __asm( \
+ __asm__ __volatile__( \
"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
"vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \
@@ -253,7 +249,7 @@ typedef struct v {
: WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \
break; \
case 2: \
- __asm( \
+ __asm__ __volatile__( \
"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
: WVR0(r), WVR1(r)); \
@@ -267,7 +263,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 8: \
- __asm( \
+ __asm__ __volatile__( \
"vor " VR4(r) "," VR0(r) "," VR0(r) "\n" \
"vor " VR5(r) "," VR1(r) "," VR1(r) "\n" \
"vor " VR6(r) "," VR2(r) "," VR2(r) "\n" \
@@ -276,7 +272,7 @@ typedef struct v {
: RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
break; \
case 4: \
- __asm( \
+ __asm__ __volatile__( \
"vor " VR2(r) "," VR0(r) "," VR0(r) "\n" \
"vor " VR3(r) "," VR1(r) "," VR1(r) "\n" \
: WVR2(r), WVR3(r) \
@@ -291,7 +287,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 8: \
- __asm( \
+ __asm__ __volatile__( \
"lvx " VR0(r) " ,0,%[SRC0]\n" \
"lvx " VR1(r) " ,0,%[SRC1]\n" \
"lvx " VR2(r) " ,0,%[SRC2]\n" \
@@ -312,7 +308,7 @@ typedef struct v {
[SRC7] "r" ((OFFSET(src, 112)))); \
break; \
case 4: \
- __asm( \
+ __asm__ __volatile__( \
"lvx " VR0(r) " ,0,%[SRC0]\n" \
"lvx " VR1(r) " ,0,%[SRC1]\n" \
"lvx " VR2(r) " ,0,%[SRC2]\n" \
@@ -324,7 +320,7 @@ typedef struct v {
[SRC3] "r" ((OFFSET(src, 48)))); \
break; \
case 2: \
- __asm( \
+ __asm__ __volatile__( \
"lvx " VR0(r) " ,0,%[SRC0]\n" \
"lvx " VR1(r) " ,0,%[SRC1]\n" \
: WVR0(r), WVR1(r) \
@@ -340,7 +336,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 8: \
- __asm( \
+ __asm__ __volatile__( \
"stvx " VR0(r) " ,0,%[DST0]\n" \
"stvx " VR1(r) " ,0,%[DST1]\n" \
"stvx " VR2(r) " ,0,%[DST2]\n" \
@@ -362,7 +358,7 @@ typedef struct v {
: "memory"); \
break; \
case 4: \
- __asm( \
+ __asm__ __volatile__( \
"stvx " VR0(r) " ,0,%[DST0]\n" \
"stvx " VR1(r) " ,0,%[DST1]\n" \
"stvx " VR2(r) " ,0,%[DST2]\n" \
@@ -375,7 +371,7 @@ typedef struct v {
: "memory"); \
break; \
case 2: \
- __asm( \
+ __asm__ __volatile__( \
"stvx " VR0(r) " ,0,%[DST0]\n" \
"stvx " VR1(r) " ,0,%[DST1]\n" \
: : [DST0] "r" ((OFFSET(dst, 0))), \
@@ -400,7 +396,7 @@ typedef struct v {
#define MUL2_SETUP() \
{ \
- __asm( \
+ __asm__ __volatile__( \
"vspltisb " VR(16) ",14\n" \
"vspltisb " VR(17) ",15\n" \
"vaddubm " VR(16) "," VR(17) "," VR(16) "\n" \
@@ -412,7 +408,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 4: \
- __asm( \
+ __asm__ __volatile__( \
"vcmpgtsb 19," VR(17) "," VR0(r) "\n" \
"vcmpgtsb 18," VR(17) "," VR1(r) "\n" \
"vcmpgtsb 21," VR(17) "," VR2(r) "\n" \
@@ -434,7 +430,7 @@ typedef struct v {
: "v18", "v19", "v20", "v21"); \
break; \
case 2: \
- __asm( \
+ __asm__ __volatile__( \
"vcmpgtsb 19," VR(17) "," VR0(r) "\n" \
"vcmpgtsb 18," VR(17) "," VR1(r) "\n" \
"vand 19,19," VR(16) "\n" \
@@ -478,7 +474,7 @@ typedef struct v {
{ \
switch (REG_CNT(r)) { \
case 2: \
- __asm( \
+ __asm__ __volatile__( \
/* lts for upper part */ \
"vspltisb 15,15\n" \
"lvx 10,0,%[lt0]\n" \
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c
index 9e9c15ff4ba2..b51352b4e90b 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -142,7 +142,7 @@ static const struct {
a.b[6] = mul_lt[a.b[6]]; \
a.b[5] = mul_lt[a.b[5]]; \
a.b[4] = mul_lt[a.b[4]]; \
- fallthrough; \
+ zfs_fallthrough; \
case 4: \
a.b[3] = mul_lt[a.b[3]]; \
a.b[2] = mul_lt[a.b[2]]; \
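
The scalar implementation switches from the `fallthrough` macro (a Linux-kernel spelling) to OpenZFS's own `zfs_fallthrough`, so the same switch-case annotation builds on FreeBSD as well. A hedged sketch of what such a wrapper typically expands to; the authoritative definition lives in the OpenZFS headers, not in this hunk:

    /* Assumed shape of the wrapper, for illustration only. */
    #if defined(__GNUC__) && (__GNUC__ >= 7)
    #define zfs_fallthrough    __attribute__((__fallthrough__))
    #else
    #define zfs_fallthrough    ((void)0)
    #endif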
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c
index 56a0b123d952..02b5d6a609ab 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c
index 5ddc079a4f5d..244f137b3d09 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
index 4d7de0c6c44c..8a8b02cab5c6 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -22,6 +22,8 @@
*
* Copyright (c) 2018, Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2024 by Delphix. All rights reserved.
*/
#include <sys/vdev_impl.h>
@@ -33,6 +35,7 @@
#include <sys/zio.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
+#include <sys/arc_impl.h>
#include <sys/zap.h>
/*
@@ -103,7 +106,7 @@
* Size of rebuild reads; defaults to 1MiB per data disk and is capped at
* SPA_MAXBLOCKSIZE.
*/
-unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+static uint64_t zfs_rebuild_max_segment = 1024 * 1024;
/*
* Maximum number of parallelly executed bytes per leaf vdev caused by a
@@ -115,25 +118,25 @@ unsigned long zfs_rebuild_max_segment = 1024 * 1024;
* segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
* the queue depth short.
*
- * 32MB was selected as the default value to achieve good performance with
- * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
- * rebuild was unable to saturate all of the drives using smaller values.
- * With a value of 32MB the sequential resilver write rate was measured at
- * 800MB/s sustained while rebuilding to a distributed spare.
+ * 64MB was observed to deliver the best performance and set as the default.
+ * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c)
+ * and a rebuild rate of 1.2GB/s was measured to the distributed spare.
+ * Smaller values were unable to fully saturate the available pool I/O.
*/
-unsigned long zfs_rebuild_vdev_limit = 32 << 20;
+static uint64_t zfs_rebuild_vdev_limit = 64 << 20;
/*
* Automatically start a pool scrub when the last active sequential resilver
* completes in order to verify the checksums of all blocks which have been
* resilvered. This option is enabled by default and is strongly recommended.
*/
-int zfs_rebuild_scrub_enabled = 1;
+static int zfs_rebuild_scrub_enabled = 1;
/*
* For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
*/
-static void vdev_rebuild_thread(void *arg);
+static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg);
+static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx);
/*
* Clear the per-vdev rebuild bytes value for a vdev tree.
@@ -227,7 +230,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
mutex_enter(&vd->vdev_rebuild_lock);
- bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
vrp->vrp_min_txg = 0;
vrp->vrp_max_txg = dmu_tx_get_txg(tx);
@@ -260,7 +263,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
}
static void
-vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name)
+vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, const char *name)
{
nvlist_t *aux = fnvlist_alloc();
@@ -307,6 +310,17 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
mutex_enter(&vd->vdev_rebuild_lock);
+
+ /*
+ * Handle a second device failure if it occurs after all rebuild I/O
+ * has completed but before this sync task has been executed.
+ */
+ if (vd->vdev_rebuild_reset_wanted) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ vdev_rebuild_reset_sync(arg, tx);
+ return;
+ }
+
vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
vrp->vrp_end_time = gethrestime_sec();
@@ -448,7 +462,7 @@ vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
}
clear_rebuild_bytes(vd);
- bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
@@ -558,8 +572,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vdev_rebuild_blkptr_init(&blk, vd, start, size);
uint64_t psize = BP_GET_PSIZE(&blk);
- if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
+ if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
+ vr->vr_pass_bytes_skipped += size;
return (0);
+ }
mutex_enter(&vr->vr_io_lock);
@@ -701,7 +717,7 @@ vdev_rebuild_load(vdev_t *vd)
vd->vdev_rebuilding = B_FALSE;
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
- bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
mutex_exit(&vd->vdev_rebuild_lock);
return (SET_ERROR(ENOTSUP));
}
@@ -718,7 +734,7 @@ vdev_rebuild_load(vdev_t *vd)
* status allowing a new resilver/rebuild to be started.
*/
if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) {
- bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
} else if (err) {
mutex_exit(&vd->vdev_rebuild_lock);
return (err);
@@ -736,11 +752,12 @@ vdev_rebuild_load(vdev_t *vd)
* Each scan thread is responsible for rebuilding a top-level vdev. The
* rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
*/
-static void
+static __attribute__((noreturn)) void
vdev_rebuild_thread(void *arg)
{
vdev_t *vd = arg;
spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
int error = 0;
/*
@@ -760,7 +777,6 @@ vdev_rebuild_thread(void *arg)
ASSERT(vd->vdev_rebuilding);
ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
- ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE);
vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
@@ -773,9 +789,7 @@ vdev_rebuild_thread(void *arg)
vr->vr_pass_start_time = gethrtime();
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
-
- vr->vr_bytes_inflight_max = MAX(1ULL << 20,
- zfs_rebuild_vdev_limit * vd->vdev_children);
+ vr->vr_pass_bytes_skipped = 0;
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@@ -793,6 +807,17 @@ vdev_rebuild_thread(void *arg)
vr->vr_scan_msp = msp;
/*
+ * Calculate the max number of in-flight bytes for top-level
+ * vdev scanning operations (minimum 1MB, maximum 1/2 of
+ * arc_c_max shared by all top-level vdevs). Limits for the
+ * issuing phase are done per top-level vdev and are handled
+ * separately.
+ */
+ uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1);
+ vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20,
+ zfs_rebuild_vdev_limit * vd->vdev_children));
+
+ /*
* Removal of vdevs from the vdev tree may eliminate the need
* for the rebuild, in which case it should be canceled. The
* vdev_rebuild_cancel_wanted flag is set until the sync task
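
The new in-flight cap added in the hunk above bounds vr_bytes_inflight_max by half of arc_c_max split evenly across top-level vdevs, while keeping the existing 1 MiB floor and the per-child zfs_rebuild_vdev_limit scaling. A worked sketch with illustrative numbers; the helper name and the input values are hypothetical, only the formula mirrors the patch:

    #include <stdint.h>

    /*
     * e.g. arc_c_max = 4 GiB, 4 top-level vdevs, 8 children in this
     * vdev, zfs_rebuild_vdev_limit = 64 MiB:
     *   limit = (4 GiB / 2) / 4        = 512 MiB
     *   want  = MAX(1 MiB, 64 MiB * 8) = 512 MiB
     *   cap   = MIN(limit, want)       = 512 MiB
     */
    static uint64_t
    rebuild_inflight_cap(uint64_t arc_c_max, uint64_t top_vdevs,
        uint64_t children, uint64_t vdev_limit)
    {
            uint64_t limit = (arc_c_max / 2) / (top_vdevs ? top_vdevs : 1);
            uint64_t want = vdev_limit * children;

            if (want < (1ULL << 20))
                    want = (1ULL << 20);
            return (limit < want ? limit : want);
    }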
@@ -1047,7 +1072,8 @@ vdev_rebuild_restart_impl(vdev_t *vd)
void
vdev_rebuild_restart(spa_t *spa)
{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_load_thread == curthread);
vdev_rebuild_restart_impl(spa->spa_root_vdev);
}
@@ -1061,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
if (vd == spa->spa_root_vdev) {
for (uint64_t i = 0; i < vd->vdev_children; i++)
@@ -1111,7 +1138,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS);
if (error == ENOENT) {
- bzero(vrs, sizeof (vdev_rebuild_stat_t));
+ memset(vrs, 0, sizeof (vdev_rebuild_stat_t));
vrs->vrs_state = VDEV_REBUILD_NONE;
error = 0;
} else if (error == 0) {
@@ -1132,19 +1159,18 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
vr->vr_pass_start_time);
vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
+ vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
mutex_exit(&tvd->vdev_rebuild_lock);
}
return (error);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW,
"Max segment size in bytes of rebuild reads");
-ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW,
"Max bytes in flight per leaf vdev for sequential resilvers");
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
"Automatically scrub after sequential resilver completes");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
index f762c1df96aa..1249657f9d72 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_removal.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -94,7 +94,7 @@ typedef struct vdev_copy_arg {
* doing a device removal. This determines how much i/o we can have
* in flight concurrently.
*/
-int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
+static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
/*
* The largest contiguous segment that we will attempt to allocate when
@@ -104,7 +104,7 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
*
* See also the accessor function spa_remove_max_segment().
*/
-int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
+uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
/*
* Ignore hard IO errors during device removal. When set if a device
@@ -112,7 +112,7 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
* not be cancelled. This can result in a normally recoverable block
* becoming permanently damaged and is not recommended.
*/
-int zfs_removal_ignore_errors = 0;
+static int zfs_removal_ignore_errors = 0;
/*
* Allow a remap segment to span free chunks of at most this size. The main
@@ -130,7 +130,7 @@ int zfs_removal_ignore_errors = 0;
* - we'll do larger allocations, which may fail and fall back on smaller
* allocations
*/
-int vdev_removal_max_span = 32 * 1024;
+uint_t vdev_removal_max_span = 32 * 1024;
/*
* This is used by the test suite so that it can ensure that certain
@@ -140,7 +140,7 @@ int zfs_removal_suspend_progress = 0;
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
-static void spa_vdev_remove_thread(void *arg);
+static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg);
static int spa_vdev_remove_cancel_impl(spa_t *spa);
static void
@@ -168,8 +168,178 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
}
static void
-spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
- nvlist_t *dev_to_remove)
+vdev_activate(vdev_t *vd)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ uint64_t vdev_space = spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+
+ ASSERT(!vd->vdev_islog);
+ ASSERT(vd->vdev_noalloc);
+
+ metaslab_group_activate(mg);
+ metaslab_group_activate(vd->vdev_log_mg);
+
+ ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
+
+ spa->spa_nonallocating_dspace -= vdev_space;
+
+ vd->vdev_noalloc = B_FALSE;
+}
+
+static int
+vdev_passivate(vdev_t *vd, uint64_t *txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+
+ ASSERT(!vd->vdev_noalloc);
+
+ vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ if (mg->mg_class == normal) {
+ /*
+ * We must check that this is not the only allocating device in
+ * the pool before passivating, otherwise we will not be able
+ * to make progress because we can't allocate from any vdevs.
+ */
+ boolean_t last = B_TRUE;
+ for (uint64_t id = 0; id < rvd->vdev_children; id++) {
+ vdev_t *cvd = rvd->vdev_child[id];
+
+ if (cvd == vd ||
+ cvd->vdev_ops == &vdev_indirect_ops)
+ continue;
+
+ metaslab_class_t *mc = cvd->vdev_mg->mg_class;
+ if (mc != normal)
+ continue;
+
+ if (!cvd->vdev_noalloc) {
+ last = B_FALSE;
+ break;
+ }
+ }
+ if (last)
+ return (SET_ERROR(EINVAL));
+ }
+
+ metaslab_group_passivate(mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * We must ensure that no "stubby" log blocks are allocated
+ * on the device to be removed. These blocks could be
+ * written at any time, including while we are in the middle
+ * of copying them.
+ */
+ error = spa_reset_logs(spa);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ ASSERT(!vd->vdev_islog);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_group_activate(vd->vdev_log_mg);
+ return (error);
+ }
+
+ spa->spa_nonallocating_dspace += spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ vd->vdev_noalloc = B_TRUE;
+
+ return (0);
+}
+
+/*
+ * Turn off allocations for a top-level device from the pool.
+ *
+ * Turning off allocations for a top-level device can take a significant
+ * amount of time. As a result we use the spa_vdev_config_[enter/exit]
+ * functions which allow us to grab and release the spa_config_lock while
+ * still holding the namespace lock. During each step the configuration
+ * is synced out.
+ */
+int
+spa_vdev_noalloc(spa_t *spa, uint64_t guid)
+{
+ vdev_t *vd;
+ uint64_t txg;
+ int error = 0;
+
+ ASSERT(!MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ if (vd == NULL)
+ error = SET_ERROR(ENOENT);
+ else if (vd->vdev_mg == NULL)
+ error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
+ else if (!vd->vdev_noalloc)
+ error = vdev_passivate(vd, &txg);
+
+ if (error == 0) {
+ vdev_dirty_leaves(vd, VDD_DTL, txg);
+ vdev_config_dirty(vd);
+ }
+
+ error = spa_vdev_exit(spa, NULL, txg, error);
+
+ return (error);
+}
+
+int
+spa_vdev_alloc(spa_t *spa, uint64_t guid)
+{
+ vdev_t *vd;
+ uint64_t txg;
+ int error = 0;
+
+ ASSERT(!MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ if (vd == NULL)
+ error = SET_ERROR(ENOENT);
+ else if (vd->vdev_mg == NULL)
+ error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
+ else if (!vd->vdev_removing)
+ vdev_activate(vd);
+
+ if (error == 0) {
+ vdev_dirty_leaves(vd, VDD_DTL, txg);
+ vdev_config_dirty(vd);
+ }
+
+ (void) spa_vdev_exit(spa, NULL, txg, error);
+
+ return (error);
+}
+
+static void
+spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev,
+ int count, nvlist_t *dev_to_remove)
{
nvlist_t **newdev = NULL;
@@ -183,7 +353,8 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
}
VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
- VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
+ fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev,
+ count - 1);
for (int i = 0; i < count - 1; i++)
nvlist_free(newdev[i]);
@@ -997,11 +1168,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
metaslab_class_t *mc = mg->mg_class;
if (mc->mc_groups == 0)
mc = spa_normal_class(spa);
- int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
- zal, 0);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
+ METASLAB_DONT_THROTTLE, zal, 0);
if (error == ENOSPC && mc != spa_normal_class(spa)) {
error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
- &dst, 0, NULL, txg, 0, zal, 0);
+ &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
}
if (error != 0)
return (error);
@@ -1193,6 +1364,10 @@ vdev_remove_complete(spa_t *spa)
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+ vdev_rebuild_stop_wait(vd);
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+ uint64_t vdev_space = spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
sysevent_t *ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1200,6 +1375,12 @@ vdev_remove_complete(spa_t *spa)
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
(u_longlong_t)vd->vdev_id, (u_longlong_t)txg);
+ ASSERT3U(0, !=, vdev_space);
+ ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
+
+ /* the vdev is no longer part of the dspace */
+ spa->spa_nonallocating_dspace -= vdev_space;
+
/*
* Discard allocation state.
*/
@@ -1207,7 +1388,6 @@ vdev_remove_complete(spa_t *spa)
vdev_metaslab_fini(vd);
metaslab_group_destroy(vd->vdev_mg);
vd->vdev_mg = NULL;
- spa_log_sm_set_blocklimit(spa);
}
if (vd->vdev_log_mg != NULL) {
ASSERT0(vd->vdev_ms_count);
@@ -1410,7 +1590,7 @@ spa_remove_max_segment(spa_t *spa)
* TXG have completed (see spa_txg_zio) and writes the new mappings to disk
* (see vdev_mapping_sync()).
*/
-static void
+static __attribute__((noreturn)) void
spa_vdev_remove_thread(void *arg)
{
spa_t *spa = arg;
@@ -1619,10 +1799,32 @@ spa_vdev_remove_suspend(spa_t *spa)
mutex_exit(&svr->svr_lock);
}
-/* ARGSUSED */
+/*
+ * Return true if the "allocating" property has been set to "off"
+ */
+static boolean_t
+vdev_prop_allocating_off(vdev_t *vd)
+{
+ uint64_t objid = vd->vdev_top_zap;
+ uint64_t allocating = 1;
+
+ /* no vdev property object => no props */
+ if (objid != 0) {
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+
+ mutex_enter(&spa->spa_props_lock);
+ (void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t),
+ 1, &allocating);
+ mutex_exit(&spa->spa_props_lock);
+ }
+ return (allocating == 0);
+}
+
static int
spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
if (spa->spa_vdev_removal == NULL)
@@ -1634,10 +1836,10 @@ spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
* Cancel a removal by freeing all entries from the partial mapping
* and marking the vdev as no longer being removing.
*/
-/* ARGSUSED */
static void
spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
{
+ (void) arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
@@ -1761,6 +1963,13 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
spa_finish_removal(spa, DSS_CANCELED, tx);
vd->vdev_removing = B_FALSE;
+
+ if (!vdev_prop_allocating_off(vd)) {
+ spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
+ vdev_activate(vd);
+ spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
+ }
+
vdev_config_dirty(vd);
zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
@@ -1774,21 +1983,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
static int
spa_vdev_remove_cancel_impl(spa_t *spa)
{
- uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
-
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
spa_vdev_remove_cancel_sync, NULL, 0,
ZFS_SPACE_CHECK_EXTRA_RESERVED);
-
- if (error == 0) {
- spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
- vdev_t *vd = vdev_lookup_top(spa, vdid);
- metaslab_group_activate(vd->vdev_mg);
- ASSERT(!vd->vdev_islog);
- metaslab_group_activate(vd->vdev_log_mg);
- spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
- }
-
return (error);
}
@@ -1935,7 +2132,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
* metaslab_class_histogram_verify()
*/
vdev_metaslab_fini(vd);
- spa_log_sm_set_blocklimit(spa);
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
*txg = spa_vdev_config_enter(spa);
@@ -1984,6 +2180,11 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
+ /*
+ * This device is already being removed
+ */
+ if (vd->vdev_removing)
+ return (SET_ERROR(EALREADY));
metaslab_class_t *mc = vd->vdev_mg->mg_class;
metaslab_class_t *normal = spa_normal_class(spa);
@@ -2002,20 +2203,12 @@ spa_vdev_remove_top_check(vdev_t *vd)
ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
if (available < vd->vdev_stat.vs_alloc)
return (SET_ERROR(ENOSPC));
- } else {
+ } else if (!vd->vdev_noalloc) {
/* available space in the pool's normal class */
uint64_t available = dsl_dir_space_available(
spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
- if (available <
- vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
- /*
- * This is a normal device. There has to be enough free
- * space to remove the device and leave double the
- * "slop" space (i.e. we must leave at least 3% of the
- * pool free, in addition to the normal slop space).
- */
+ if (available < vd->vdev_stat.vs_dspace)
return (SET_ERROR(ENOSPC));
- }
}
/*
@@ -2058,7 +2251,6 @@ spa_vdev_remove_top_check(vdev_t *vd)
* and not be raidz or draid.
*/
vdev_t *rvd = spa->spa_root_vdev;
- int num_indirect = 0;
for (uint64_t id = 0; id < rvd->vdev_children; id++) {
vdev_t *cvd = rvd->vdev_child[id];
@@ -2074,8 +2266,6 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (cvd->vdev_ashift != 0 &&
cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
- if (cvd->vdev_ops == &vdev_indirect_ops)
- num_indirect++;
if (!vdev_is_concrete(cvd))
continue;
if (vdev_get_nparity(cvd) != 0)
@@ -2108,6 +2298,7 @@ static int
spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
{
spa_t *spa = vd->vdev_spa;
+ boolean_t set_noalloc = B_FALSE;
int error;
/*
@@ -2116,8 +2307,6 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
* are errors.
*/
error = spa_vdev_remove_top_check(vd);
- if (error != 0)
- return (error);
/*
* Stop allocating from this vdev. Note that we must check
@@ -2127,31 +2316,22 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
* The above check for sufficient free space serves this
* purpose.
*/
- metaslab_group_t *mg = vd->vdev_mg;
- metaslab_group_passivate(mg);
- ASSERT(!vd->vdev_islog);
- metaslab_group_passivate(vd->vdev_log_mg);
-
- /*
- * Wait for the youngest allocations and frees to sync,
- * and then wait for the deferral of those frees to finish.
- */
- spa_vdev_config_exit(spa, NULL,
- *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+ if (error == 0 && !vd->vdev_noalloc) {
+ set_noalloc = B_TRUE;
+ error = vdev_passivate(vd, txg);
+ }
- /*
- * We must ensure that no "stubby" log blocks are allocated
- * on the device to be removed. These blocks could be
- * written at any time, including while we are in the middle
- * of copying them.
- */
- error = spa_reset_logs(spa);
+ if (error != 0)
+ return (error);
/*
* We stop any initializing and TRIM that is currently in progress
* but leave the state as "active". This will allow the process to
* resume if the removal is canceled sometime later.
*/
+
+ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+
vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_wait(vd);
@@ -2162,13 +2342,11 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
* Things might have changed while the config lock was dropped
* (e.g. space usage). Check for errors again.
*/
- if (error == 0)
- error = spa_vdev_remove_top_check(vd);
+ error = spa_vdev_remove_top_check(vd);
if (error != 0) {
- metaslab_group_activate(mg);
- ASSERT(!vd->vdev_islog);
- metaslab_group_activate(vd->vdev_log_mg);
+ if (set_noalloc)
+ vdev_activate(vd);
spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
@@ -2206,7 +2384,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
int error = 0, error_log;
boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
sysevent_t *ev = NULL;
- char *vd_type = NULL, *vd_path = NULL;
+ const char *vd_type = NULL;
+ char *vd_path = NULL;
ASSERT(spa_writeable(spa));
@@ -2235,7 +2414,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
* in this pool.
*/
if (vd == NULL || unspare) {
- char *type;
+ const char *type;
boolean_t draid_spare = B_FALSE;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
@@ -2363,17 +2542,17 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
return (0);
}
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
"Ignore hard IO errors when removing device");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW,
"Largest contiguous segment to allocate when removing device");
-ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW,
"Largest span of free chunks a remap segment can span");
-ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW,
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW,
"Pause device removal after this many bytes are copied "
"(debug use only - causes removal to hang)");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_root.c b/sys/contrib/openzfs/module/zfs/vdev_root.c
index 45ddc2f71927..e132643dc330 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_root.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_root.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c
index deea7fedd770..9cf10332e8bf 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_trim.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,9 +20,10 @@
*/
/*
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2024 by Delphix. All rights reserved.
* Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
* Copyright (c) 2021 Hewlett Packard Enterprise Development LP
+ * Copyright 2023 RackTop Systems, Inc.
*/
#include <sys/spa.h>
@@ -96,12 +97,12 @@
/*
* Maximum size of TRIM I/O, ranges will be chunked into 128MiB lengths.
*/
-unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
+static unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
/*
* Minimum size of TRIM I/O, extents smaller than 32 KiB will be skipped.
*/
-unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
+static unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
/*
* Skip uninitialized metaslabs during the TRIM process. This option is
@@ -118,7 +119,7 @@ unsigned int zfs_trim_metaslab_skip = 0;
* concurrent TRIM I/Os issued to the device is controlled by the
* zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
*/
-unsigned int zfs_trim_queue_limit = 10;
+static unsigned int zfs_trim_queue_limit = 10;
/*
* The minimum number of transaction groups between automatic trims of a
@@ -134,7 +135,7 @@ unsigned int zfs_trim_queue_limit = 10;
* has the opposite effect. The default value of 32 was determined through
* testing to be a reasonable compromise.
*/
-unsigned int zfs_trim_txg_batch = 32;
+static unsigned int zfs_trim_txg_batch = 32;
/*
* The trim_args are a control structure which describe how a leaf vdev
@@ -168,7 +169,8 @@ static boolean_t
vdev_trim_should_stop(vdev_t *vd)
{
return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
- vd->vdev_detached || vd->vdev_top->vdev_removing);
+ vd->vdev_detached || vd->vdev_top->vdev_removing ||
+ vd->vdev_top->vdev_rz_expanding);
}
/*
@@ -179,10 +181,31 @@ vdev_autotrim_should_stop(vdev_t *tvd)
{
return (tvd->vdev_autotrim_exit_wanted ||
!vdev_writeable(tvd) || tvd->vdev_removing ||
+ tvd->vdev_rz_expanding ||
spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
}
/*
+ * Wait for the given number of kicks; return true if the wait is aborted due to
+ * vdev_autotrim_exit_wanted.
+ */
+static boolean_t
+vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick)
+{
+ mutex_enter(&vd->vdev_autotrim_lock);
+ for (int i = 0; i < num_of_kick; i++) {
+ if (vd->vdev_autotrim_exit_wanted)
+ break;
+ cv_wait_idle(&vd->vdev_autotrim_kick_cv,
+ &vd->vdev_autotrim_lock);
+ }
+ boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted;
+ mutex_exit(&vd->vdev_autotrim_lock);
+
+ return (exit_wanted);
+}
+
+/*
* The sync task for updating the on-disk state of a manual TRIM. This
* is scheduled by vdev_trim_change_state().
*/
@@ -202,7 +225,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
kmem_free(arg, sizeof (uint64_t));
vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
- if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ if (vd == NULL || vd->vdev_top->vdev_removing ||
+ !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
return;
uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
@@ -572,6 +596,7 @@ vdev_trim_ranges(trim_args_t *ta)
uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
spa_t *spa = vd->vdev_spa;
+ int error = 0;
ta->trim_start_time = gethrtime();
ta->trim_bytes_done = 0;
@@ -591,19 +616,32 @@ vdev_trim_ranges(trim_args_t *ta)
uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
for (uint64_t w = 0; w < writes_required; w++) {
- int error;
-
error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
rs_get_start(rs, ta->trim_tree) +
(w *extent_bytes_max), MIN(size -
(w * extent_bytes_max), extent_bytes_max));
if (error != 0) {
- return (error);
+ goto done;
}
}
}
- return (0);
+done:
+ /*
+ * Make sure all TRIMs for this metaslab have completed before
+ * returning. TRIM zios have lower priority over regular or syncing
+ * zios, so all TRIM zios for this metaslab must complete before the
+ * metaslab is re-enabled. Otherwise it's possible write zios to
+ * this metaslab could cut ahead of still queued TRIM zios for this
+ * metaslab causing corruption if the ranges overlap.
+ */
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[0] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ return (error);
}
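
vdev_trim_ranges(), amended above so that every exit path now waits for in-flight TRIM zios before the metaslab can be re-enabled, still splits each range into commands of at most zfs_trim_extent_bytes_max using the writes_required arithmetic visible in the context lines. A small illustrative program, assuming the default 128 MiB maximum and a made-up 300 MiB extent:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t size = 300ULL << 20;           /* 300 MiB extent */
            uint64_t extent_bytes_max = 128ULL << 20;
            uint64_t writes_required =
                ((size - 1) / extent_bytes_max) + 1;  /* = 3 */

            for (uint64_t w = 0; w < writes_required; w++) {
                    uint64_t len = size - (w * extent_bytes_max);

                    if (len > extent_bytes_max)
                            len = extent_bytes_max;
                    /* prints 128 MiB, 128 MiB, 44 MiB */
                    printf("trim #%llu: %llu bytes\n",
                        (unsigned long long)w, (unsigned long long)len);
            }
            return (0);
    }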
static void
@@ -834,7 +872,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
* by its ms_allocatable. While a metaslab is undergoing trimming it is
* not eligible for new allocations.
*/
-static void
+static __attribute__((noreturn)) void
vdev_trim_thread(void *arg)
{
vdev_t *vd = arg;
@@ -922,11 +960,6 @@ vdev_trim_thread(void *arg)
}
spa_config_exit(spa, SCL_CONFIG, FTAG);
- mutex_enter(&vd->vdev_trim_io_lock);
- while (vd->vdev_trim_inflight[0] > 0) {
- cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
- }
- mutex_exit(&vd->vdev_trim_io_lock);
range_tree_destroy(ta.trim_tree);
@@ -976,6 +1009,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
ASSERT(!vd->vdev_detached);
ASSERT(!vd->vdev_trim_exit_wanted);
ASSERT(!vd->vdev_top->vdev_removing);
+ ASSERT(!vd->vdev_rz_expanding);
vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
vd->vdev_trim_thread = thread_create(NULL, 0,
@@ -1003,9 +1037,11 @@ vdev_trim_stop_wait_impl(vdev_t *vd)
void
vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
{
+ (void) spa;
vdev_t *vd;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_trim_lock);
@@ -1044,7 +1080,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
if (vd_list == NULL) {
vdev_trim_stop_wait_impl(vd);
} else {
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@@ -1080,7 +1117,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
list_t vd_list;
vdev_t *vd_l2cache;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_trim_node));
@@ -1113,7 +1151,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
void
vdev_trim_restart(vdev_t *vd)
{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ vd->vdev_spa->spa_load_thread == curthread);
ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
if (vd->vdev_leaf_zap != 0) {
@@ -1132,12 +1171,13 @@ vdev_trim_restart(vdev_t *vd)
ASSERT(err == 0 || err == ENOENT);
vd->vdev_trim_action_time = timestamp;
- if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
- vd->vdev_offline) {
+ if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
+ vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
/* load progress for reporting, but don't resume */
VERIFY0(vdev_trim_load(vd));
} else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
+ !vd->vdev_top->vdev_rz_expanding &&
vd->vdev_trim_thread == NULL) {
VERIFY0(vdev_trim_load(vd));
vdev_trim(vd, vd->vdev_trim_rate,
@@ -1174,7 +1214,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
* N.B. This behavior is different from a manual TRIM where a thread
* is created for each leaf vdev, instead of each top-level vdev.
*/
-static void
+static __attribute__((noreturn)) void
vdev_autotrim_thread(void *arg)
{
vdev_t *vd = arg;
@@ -1187,12 +1227,10 @@ vdev_autotrim_thread(void *arg)
mutex_exit(&vd->vdev_autotrim_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
- uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;
-
while (!vdev_autotrim_should_stop(vd)) {
int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
- boolean_t issued_trim = B_FALSE;
+ uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
+ uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;
/*
* All of the metaslabs are divided in to groups of size
@@ -1224,6 +1262,8 @@ vdev_autotrim_thread(void *arg)
i += txgs_per_trim) {
metaslab_t *msp = vd->vdev_ms[i];
range_tree_t *trim_tree;
+ boolean_t issued_trim = B_FALSE;
+ boolean_t wait_aborted = B_FALSE;
spa_config_exit(spa, SCL_CONFIG, FTAG);
metaslab_disable(msp);
@@ -1374,7 +1414,18 @@ vdev_autotrim_thread(void *arg)
range_tree_vacate(trim_tree, NULL, NULL);
range_tree_destroy(trim_tree);
- metaslab_enable(msp, issued_trim, B_FALSE);
+ /*
+ * Wait for several kicks to ensure the TRIM I/O is
+ * synced. If the wait is aborted due to
+ * vdev_autotrim_exit_wanted, we need to signal
+ * metaslab_enable() to wait for sync.
+ */
+ if (issued_trim) {
+ wait_aborted = vdev_autotrim_wait_kick(vd,
+ TXG_CONCURRENT_STATES + TXG_DEFER_SIZE);
+ }
+
+ metaslab_enable(msp, wait_aborted, B_FALSE);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
for (uint64_t c = 0; c < children; c++) {
@@ -1388,17 +1439,14 @@ vdev_autotrim_thread(void *arg)
}
kmem_free(tap, sizeof (trim_args_t) * children);
+
+ if (vdev_autotrim_should_stop(vd))
+ break;
}
spa_config_exit(spa, SCL_CONFIG, FTAG);
- /*
- * After completing the group of metaslabs wait for the next
- * open txg. This is done to make sure that a minimum of
- * zfs_trim_txg_batch txgs will occur before these metaslabs
- * are trimmed again.
- */
- txg_wait_open(spa_get_dsl(spa), 0, issued_trim);
+ vdev_autotrim_wait_kick(vd, 1);
shift++;
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -1454,7 +1502,8 @@ vdev_autotrim(spa_t *spa)
mutex_enter(&tvd->vdev_autotrim_lock);
if (vdev_writeable(tvd) && !tvd->vdev_removing &&
- tvd->vdev_autotrim_thread == NULL) {
+ tvd->vdev_autotrim_thread == NULL &&
+ !tvd->vdev_rz_expanding) {
ASSERT3P(tvd->vdev_top, ==, tvd);
tvd->vdev_autotrim_thread = thread_create(NULL, 0,
@@ -1476,11 +1525,9 @@ vdev_autotrim_stop_wait(vdev_t *tvd)
mutex_enter(&tvd->vdev_autotrim_lock);
if (tvd->vdev_autotrim_thread != NULL) {
tvd->vdev_autotrim_exit_wanted = B_TRUE;
-
- while (tvd->vdev_autotrim_thread != NULL) {
- cv_wait(&tvd->vdev_autotrim_cv,
- &tvd->vdev_autotrim_lock);
- }
+ cv_broadcast(&tvd->vdev_autotrim_kick_cv);
+ cv_wait(&tvd->vdev_autotrim_cv,
+ &tvd->vdev_autotrim_lock);
ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
tvd->vdev_autotrim_exit_wanted = B_FALSE;
@@ -1488,6 +1535,24 @@ vdev_autotrim_stop_wait(vdev_t *tvd)
mutex_exit(&tvd->vdev_autotrim_lock);
}
+void
+vdev_autotrim_kick(spa_t *spa)
+{
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ vdev_t *root_vd = spa->spa_root_vdev;
+ vdev_t *tvd;
+
+ for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
+ tvd = root_vd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_autotrim_lock);
+ if (tvd->vdev_autotrim_thread != NULL)
+ cv_broadcast(&tvd->vdev_autotrim_kick_cv);
+ mutex_exit(&tvd->vdev_autotrim_lock);
+ }
+}
+
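
vdev_autotrim_kick() above is the producer half of the kick handshake this patch introduces: vdev_autotrim_wait_kick(), added earlier in this file's diff, counts those broadcasts so the autotrim thread paces itself on pool sync activity instead of calling txg_wait_open(). A minimal user-space sketch of the same pattern, assuming POSIX threads stand in for the kernel cv_*/mutex_* primitives:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t kick_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t kick_cv = PTHREAD_COND_INITIALIZER;
    static bool exit_wanted;

    /* producer: called once per simulated txg sync */
    static void
    autotrim_kick(void)
    {
            pthread_mutex_lock(&kick_lock);
            pthread_cond_broadcast(&kick_cv);
            pthread_mutex_unlock(&kick_lock);
    }

    /* consumer: returns true if the wait was aborted by an exit request */
    static bool
    autotrim_wait_kick(int num_of_kick)
    {
            bool aborted;

            pthread_mutex_lock(&kick_lock);
            for (int i = 0; i < num_of_kick && !exit_wanted; i++)
                    pthread_cond_wait(&kick_cv, &kick_lock);
            aborted = exit_wanted;
            pthread_mutex_unlock(&kick_lock);
            return (aborted);
    }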
/*
* Wait for all of the vdev_autotrim_thread associated with the pool to
* be terminated (canceled or stopped).
@@ -1507,19 +1572,19 @@ vdev_autotrim_stop_all(spa_t *spa)
void
vdev_autotrim_restart(spa_t *spa)
{
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
-
+ ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
+ spa->spa_load_thread == curthread);
if (spa->spa_autotrim)
vdev_autotrim(spa);
}
-static void
+static __attribute__((noreturn)) void
vdev_trim_l2arc_thread(void *arg)
{
vdev_t *vd = arg;
spa_t *spa = vd->vdev_spa;
l2arc_dev_t *dev = l2arc_vdev_get(vd);
- trim_args_t ta;
+ trim_args_t ta = {0};
range_seg64_t physical_rs;
ASSERT(vdev_is_concrete(vd));
@@ -1530,7 +1595,6 @@ vdev_trim_l2arc_thread(void *arg)
vd->vdev_trim_partial = 0;
vd->vdev_trim_secure = 0;
- bzero(&ta, sizeof (ta));
ta.trim_vdev = vd;
ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
ta.trim_type = TRIM_TYPE_MANUAL;
@@ -1590,7 +1654,7 @@ vdev_trim_l2arc_thread(void *arg)
*/
spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
RW_READER);
- bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
+ memset(dev->l2ad_dev_hdr, 0, dev->l2ad_dev_hdr_asize);
l2arc_dev_hdr_update(dev);
spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);
@@ -1654,9 +1718,9 @@ vdev_trim_l2arc(spa_t *spa)
int
vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
{
- trim_args_t ta;
- range_seg64_t physical_rs;
- int error;
+ trim_args_t ta = {0};
+ range_seg64_t physical_rs;
+ int error;
physical_rs.rs_start = start;
physical_rs.rs_end = start + size;
@@ -1664,8 +1728,8 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
ASSERT(vd->vdev_ops->vdev_op_leaf);
ASSERT(!vd->vdev_detached);
ASSERT(!vd->vdev_top->vdev_removing);
+ ASSERT(!vd->vdev_top->vdev_rz_expanding);
- bzero(&ta, sizeof (ta));
ta.trim_vdev = vd;
ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
ta.trim_type = TRIM_TYPE_SIMPLE;
@@ -1708,19 +1772,17 @@ EXPORT_SYMBOL(vdev_autotrim_restart);
EXPORT_SYMBOL(vdev_trim_l2arc);
EXPORT_SYMBOL(vdev_trim_simple);
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
- "Max size of TRIM commands, larger will be split");
+ "Max size of TRIM commands, larger will be split");
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW,
- "Min size of TRIM commands, smaller will be skipped");
+ "Min size of TRIM commands, smaller will be skipped");
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW,
- "Skip metaslabs which have never been initialized");
+ "Skip metaslabs which have never been initialized");
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW,
- "Min number of txgs to aggregate frees before issuing TRIM");
+ "Min number of txgs to aggregate frees before issuing TRIM");
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW,
- "Max queued TRIMs outstanding per leaf vdev");
-/* END CSTYLED */
+ "Max queued TRIMs outstanding per leaf vdev");
diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c
index 6f03beef3bdb..03b76ea1b7bf 100644
--- a/sys/contrib/openzfs/module/zfs/zap.c
+++ b/sys/contrib/openzfs/module/zfs/zap.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -22,6 +22,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
+ * Copyright (c) 2023, Klara Inc.
*/
/*
@@ -41,6 +43,7 @@
#include <sys/spa.h>
#include <sys/dmu.h>
+#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <sys/fs/zfs.h>
@@ -76,13 +79,18 @@
* the zfs-specific implementation of the directory's st_size (which is
* the number of entries).
*/
-int zap_iterate_prefetch = B_TRUE;
+static int zap_iterate_prefetch = B_TRUE;
-int fzap_default_block_shift = 14; /* 16k blocksize */
+/*
+ * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
+ * collapsed into a single block.
+ */
+int zap_shrink_enabled = B_TRUE;
-extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+int fzap_default_block_shift = 14; /* 16k blocksize */
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
void
fzap_byteswap(void *vbuf, size_t size)
@@ -114,7 +122,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
* explicitly zero it since it might be coming from an
* initialized microzap
*/
- bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+ memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size);
zp->zap_block_type = ZBT_HEADER;
zp->zap_magic = ZAP_MAGIC;
@@ -135,7 +143,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
* set up block 1 - the first leaf
*/
dmu_buf_t *db;
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db, tx);
@@ -184,7 +192,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied);
- dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ dmu_prefetch_by_dnode(zap->zap_dnode, 0,
tbl->zt_blk << bs, tbl->zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
}
@@ -195,21 +203,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
uint64_t b = tbl->zt_blks_copied;
dmu_buf_t *db_old;
- int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
/* first half of entries in old[b] go to new[2*b+0] */
dmu_buf_t *db_new;
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
(newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
(newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
@@ -257,7 +265,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
uint64_t off = idx & ((1<<(bs-3))-1);
dmu_buf_t *db;
- int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
@@ -269,7 +277,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
uint64_t off2 = idx2 & ((1<<(bs-3))-1);
dmu_buf_t *db2;
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
DMU_READ_NO_PREFETCH);
if (err != 0) {
@@ -298,16 +306,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
uint64_t blk = idx >> (bs-3);
uint64_t off = idx & ((1<<(bs-3))-1);
- /*
- * Note: this is equivalent to dmu_buf_hold(), but we use
- * _dnode_enter / _by_dnode because it's faster because we don't
- * have to hold the dnode.
- */
- dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
dmu_buf_t *db;
- int err = dmu_buf_hold_by_dnode(dn,
+ int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
- dmu_buf_dnode_exit(zap->zap_dbuf);
if (err != 0)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
@@ -321,11 +322,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
*/
blk = (idx*2) >> (bs-3);
- dn = dmu_buf_dnode_enter(zap->zap_dbuf);
- err = dmu_buf_hold_by_dnode(dn,
+ err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
DMU_READ_NO_PREFETCH);
- dmu_buf_dnode_exit(zap->zap_dbuf);
if (err == 0)
dmu_buf_rele(db, FTAG);
}
@@ -370,7 +369,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
uint64_t newblk = zap_allocate_blocks(zap, 1);
dmu_buf_t *db_new;
- int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
DMU_READ_NO_PREFETCH);
if (err != 0)
@@ -426,20 +425,36 @@ zap_leaf_evict_sync(void *dbu)
static zap_leaf_t *
zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
{
- zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
-
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
- rw_enter(&l->l_rwlock, RW_WRITER);
- l->l_blkid = zap_allocate_blocks(zap, 1);
- l->l_dbuf = NULL;
+ uint64_t blkid = zap_allocate_blocks(zap, 1);
+ dmu_buf_t *db = NULL;
- VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+ VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
+ blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db,
DMU_READ_NO_PREFETCH));
- dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
- VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
+
+ /*
+ * Create the leaf structure and stash it on the dbuf. If the zap was
+ * recently shrunk or truncated, the dbuf might have been sitting in the
+ * cache waiting to be evicted, and so still have the old leaf attached
+ * to it. If so, just reuse it.
+ */
+ zap_leaf_t *l = dmu_buf_get_user(db);
+ if (l == NULL) {
+ l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l->l_blkid = blkid;
+ l->l_dbuf = db;
+ rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL,
+ &l->l_dbuf);
+ dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
+ } else {
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ }
+
+ rw_enter(&l->l_rwlock, RW_WRITER);
dmu_buf_will_dirty(l->l_dbuf, tx);
zap_leaf_init(l, zap->zap_normflags != 0);
@@ -535,10 +550,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
return (SET_ERROR(ENOENT));
int bs = FZAP_BLOCK_SHIFT(zap);
- dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
- int err = dmu_buf_hold_by_dnode(dn,
+ int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
- dmu_buf_dnode_exit(zap->zap_dbuf);
if (err != 0)
return (err);
@@ -600,6 +613,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
}
static int
+zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
+ dmu_tx_t *tx)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int epb = bs >> 3; /* entries per block */
+ int err = 0;
+
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ /*
+ * Check for i/o errors
+ */
+ for (int i = 0; i < nptrs; i += epb) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, idx + i, &blk);
+ if (err != 0) {
+ return (err);
+ }
+ }
+
+ for (int i = 0; i < nptrs; i++) {
+ err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
+ ASSERT0(err); /* we checked for i/o errors above */
+ if (err != 0)
+ break;
+ }
+
+ return (err);
+}
+
+#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len)))
+
+/*
+ * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl.
+ * If two leaves are siblings, their ranges are adjacent and contain the same
+ * number of entries. In order to find out if a leaf has a sibling, we need to
+ * check the range corresponding to the sibling leaf. There is no need to check
+ * all entries in the range; we only need to check the first and the last one.
+ */
+static uint64_t
+check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
+ uint64_t nptrs = (1 << pref_diff);
+ uint64_t first;
+ uint64_t last;
+
+ ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+
+ if (zap_idx_to_blk(zap, idx, &first) != 0)
+ return (0);
+
+ if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
+ return (0);
+
+ if (first != last)
+ return (0);
+ return (first);
+}
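The sibling-range arithmetic used by check_sibling_ptrtbl_range() above is easier to follow with the shifts written out. The sketch below is a standalone illustration under the same assumption as the patch (prefix_len <= zt_shift); the helper name and plain stdint types are hypothetical and not part of OpenZFS.

#include <stdint.h>

/*
 * Illustrative only: a leaf whose prefix has prefix_len bits owns
 * 2^(zt_shift - prefix_len) consecutive ptrtbl entries, starting at
 * prefix << (zt_shift - prefix_len).  Flipping the lowest prefix bit
 * (prefix ^ 1) yields the sibling leaf's range.
 */
static void
example_sibling_ptrtbl_range(uint64_t prefix, uint64_t prefix_len,
    uint64_t zt_shift, uint64_t *start, uint64_t *nptrs)
{
	uint64_t pref_diff = zt_shift - prefix_len;	/* unused low bits */
	uint64_t sl_prefix = prefix ^ 1;		/* sibling prefix */

	*start = sl_prefix << pref_diff;	/* first ptrtbl index of sibling */
	*nptrs = 1ULL << pref_diff;		/* entries in the range */
}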
+
+static int
zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
uint64_t blk;
@@ -628,7 +707,7 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
static int
zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
- void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
+ const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
{
zap_t *zap = zn->zn_zap;
uint64_t hash = zn->zn_hash;
@@ -648,6 +727,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
uint64_t object = zap->zap_object;
zap_put_leaf(l);
+ *lp = l = NULL;
zap_unlockdir(zap, tag);
err = zap_lockdir(os, object, tx, RW_WRITER,
FALSE, FALSE, tag, &zn->zn_zap);
@@ -717,7 +797,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
static void
zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
- void *tag, dmu_tx_t *tx)
+ const void *tag, dmu_tx_t *tx)
{
zap_t *zap = zn->zn_zap;
int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
@@ -826,7 +906,7 @@ fzap_lookup(zap_name_t *zn,
int
fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
+ const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err;
@@ -857,28 +937,24 @@ retry:
} else if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
- if (err == 0) {
+ if (err == 0)
goto retry;
- } else if (err == ENOSPC) {
- /*
- * If we failed to expand the leaf, then bailout
- * as there is no point trying
- * zap_put_leaf_maybe_grow_ptrtbl().
- */
- return (err);
- }
}
out:
- if (zap != NULL)
- zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ if (l != NULL) {
+ if (err == ENOSPC)
+ zap_put_leaf(l);
+ else
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ }
return (err);
}
int
fzap_add(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
- const void *val, void *tag, dmu_tx_t *tx)
+ const void *val, const void *tag, dmu_tx_t *tx)
{
int err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
@@ -891,7 +967,7 @@ fzap_add(zap_name_t *zn,
int
fzap_update(zap_name_t *zn,
int integer_size, uint64_t num_integers, const void *val,
- void *tag, dmu_tx_t *tx)
+ const void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err;
@@ -928,8 +1004,12 @@ retry:
goto retry;
}
- if (zap != NULL)
- zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ if (l != NULL) {
+ if (err == ENOSPC)
+ zap_put_leaf(l);
+ else
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ }
return (err);
}
@@ -948,9 +1028,9 @@ fzap_length(zap_name_t *zn,
if (err != 0)
goto out;
- if (integer_size != 0)
+ if (integer_size != NULL)
*integer_size = zeh.zeh_integer_size;
- if (num_integers != 0)
+ if (num_integers != NULL)
*num_integers = zeh.zeh_num_integers;
out:
zap_put_leaf(l);
@@ -971,6 +1051,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
if (err == 0) {
zap_entry_remove(&zeh);
zap_increment_num_entries(zn->zn_zap, -1, tx);
+
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
+ zap_shrink_enabled)
+ return (zap_shrink(zn, l, tx));
}
zap_put_leaf(l);
return (err);
@@ -987,7 +1071,7 @@ fzap_prefetch(zap_name_t *zn)
if (zap_idx_to_blk(zap, idx, &blk) != 0)
return;
int bs = FZAP_BLOCK_SHIFT(zap);
- dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+ dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs,
ZIO_PRIORITY_SYNC_READ);
}
@@ -1230,18 +1314,24 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
*/
if (zc->zc_hash == 0 && zap_iterate_prefetch &&
zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
- dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+ dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
ZIO_PRIORITY_ASYNC_READ);
}
- if (zc->zc_leaf &&
- (ZAP_HASH_IDX(zc->zc_hash,
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
- zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ if (zc->zc_leaf) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
- zap_put_leaf(zc->zc_leaf);
- zc->zc_leaf = NULL;
+
+ /*
+ * The leaf was either shrunk or split.
+ */
+ if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
}
again:
@@ -1250,8 +1340,6 @@ again:
&zc->zc_leaf);
if (err != 0)
return (err);
- } else {
- rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
}
l = zc->zc_leaf;
@@ -1358,7 +1446,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
} else {
- dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ dmu_prefetch_by_dnode(zap->zap_dnode, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
@@ -1368,7 +1456,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
dmu_buf_t *db;
int err;
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (err == 0) {
@@ -1380,7 +1468,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
}
}
-/* BEGIN CSTYLED */
+/*
+ * Find last allocated block and update freeblk.
+ */
+static void
+zap_trunc(zap_t *zap)
+{
+ uint64_t nentries;
+ uint64_t lastblk;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
+ /* External ptrtbl */
+ nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
+ } else {
+ /* Embedded ptrtbl */
+ nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ lastblk = 0;
+ }
+
+ for (uint64_t idx = 0; idx < nentries; idx++) {
+ uint64_t blk;
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ if (blk > lastblk)
+ lastblk = blk;
+ }
+
+ ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
+
+ zap_f_phys(zap)->zap_freeblk = lastblk + 1;
+}
+
+/*
+ * ZAP shrinking algorithm.
+ *
+ * We shrink the ZAP recursively, removing empty leaves. We can remove an
+ * empty leaf only if it has a sibling. Sibling leaves have the same prefix
+ * length and their prefixes differ only by the least significant (sibling)
+ * bit. We require both siblings to be empty. This eliminates the need to
+ * rehash the non-empty remaining leaf. When we have removed one of two empty
+ * siblings, we set the ptrtbl entries of the removed leaf to point to the
+ * remaining leaf. The prefix length of the remaining leaf is decremented. As
+ * a result, it has a new prefix and might have a new sibling, so we repeat
+ * the process.
+ *
+ * Steps:
+ * 1. Check if a sibling leaf (sl) exists and is empty.
+ * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
+ * 3. Release the sibling (sl) so it can be dereferenced again with the
+ * WRITER lock.
+ * 4. Upgrade the zapdir lock to WRITER (once).
+ * 5. Dereference the released leaves again.
+ * 6. If needed, recheck whether both leaves are still siblings and empty.
+ * 7. Set the ptrtbl pointers of the removed leaf (slbit 1) to point to the
+ * blkid of the remaining leaf (slbit 0).
+ * 8. Free the disk block of the removed leaf (dmu_free_range).
+ * 9. Decrement prefix_len of the remaining leaf.
+ * 10. Repeat the steps.
+ */
+static int
+zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ uint64_t hash = zn->zn_hash;
+ uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ boolean_t trunc = B_FALSE;
+ int err = 0;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+ ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
+
+ boolean_t writer = B_FALSE;
+
+ /*
+ * To avoid deadlock, always dereference leaves in the same order -
+ * sibling 0 first, then sibling 1.
+ */
+ while (prefix_len) {
+ zap_leaf_t *sl;
+ int64_t prefix_diff = zt_shift - prefix_len;
+ uint64_t sl_prefix = prefix ^ 1;
+ uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
+ int slbit = prefix & 1;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
+
+ /*
+ * Check if there is a sibling by reading ptrtbl ptrs.
+ */
+ if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
+ break;
+
+ /*
+ * If we are sibling 1, release our leaf - we haven't yet dereferenced
+ * sibling 0.
+ */
+ if (slbit == 1) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Dereference sibling leaf and check if it is empty.
+ */
+ if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
+ &sl)) != 0)
+ break;
+
+ ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
+
+ /*
+ * Check if we have a sibling and it is empty.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
+ zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ zap_put_leaf(sl);
+
+ /*
+ * If there are two empty siblings, we have work to do, so
+ * we need to lock the ZAP ptrtbl as WRITER.
+ */
+ if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
+ /* We failed to upgrade */
+ if (l != NULL) {
+ zap_put_leaf(l);
+ l = NULL;
+ }
+
+ /*
+ * Usually, the right way to upgrade from a READER lock
+ * to a WRITER lock is to call zap_unlockdir() and
+ * zap_lockdir(), but we do not have a tag. Instead,
+ * we do it in a more sophisticated way.
+ */
+ rw_exit(&zap->zap_rwlock);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+
+ zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ writer = B_TRUE;
+ }
+
+ /*
+ * Here we have the WRITER lock for the ptrtbl.
+ * Now we need a WRITER lock for both sibling leaves.
+ * Also, we have to recheck if the leaves are still siblings
+ * and still empty.
+ */
+ if (l == NULL) {
+ /* sibling 0 */
+ if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
+ tx, RW_WRITER, &l)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
+ break;
+ }
+
+ /* sibling 1 */
+ if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
+ RW_WRITER, &sl)) != 0)
+ break;
+
+ /*
+ * The leaf isn't empty anymore or
+ * it was shrunk/split while our locks were down.
+ */
+ if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
+ zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /* If we have gotten here, we have a leaf to collapse */
+ uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
+ uint64_t nptrs = (1ULL << prefix_diff);
+ uint64_t sl_blkid = sl->l_blkid;
+
+ /*
+ * Set the ptrtbl entries to point to the sibling 0 blkid.
+ */
+ if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
+ tx)) != 0) {
+ zap_put_leaf(sl);
+ break;
+ }
+
+ /*
+ * Free sibling 1 disk block.
+ */
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
+ trunc = B_TRUE;
+
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ sl_blkid << bs, 1 << bs, tx);
+ zap_put_leaf(sl);
+
+ zap_f_phys(zap)->zap_num_leafs--;
+
+ /*
+ * Update prefix and prefix_len.
+ */
+ zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
+
+ prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
+ prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ }
+
+ if (trunc)
+ zap_trunc(zap);
+
+ if (l != NULL)
+ zap_put_leaf(l);
+
+ return (err);
+}
+
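For a concrete feel of the collapse loop in zap_shrink() above, here is a minimal, self-contained sketch of a single iteration on plain integers; the function name is hypothetical and no ZAP state is touched.

#include <stdint.h>

/*
 * Illustrative only: after two empty siblings (e.g. prefixes 0b1010 and
 * 0b1011 with prefix_len 4) are collapsed, the surviving leaf drops its
 * sibling bit and covers twice the hash space (prefix 0b101,
 * prefix_len 3), possibly exposing a new sibling for the next pass.
 */
static void
example_collapse_step(uint64_t *prefix, uint64_t *prefix_len)
{
	*prefix >>= 1;		/* drop the sibling bit */
	*prefix_len -= 1;	/* surviving leaf covers a wider range */
}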
+/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
"When iterating ZAP object, prefetch it");
-/* END CSTYLED */
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
+ "Enable ZAP shrinking");
diff --git a/sys/contrib/openzfs/module/zfs/zap_leaf.c b/sys/contrib/openzfs/module/zfs/zap_leaf.c
index aa6c298c3b4b..032aca92695e 100644
--- a/sys/contrib/openzfs/module/zfs/zap_leaf.c
+++ b/sys/contrib/openzfs/module/zfs/zap_leaf.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -41,7 +41,8 @@
#include <sys/zap_leaf.h>
#include <sys/arc.h>
-static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le,
+ uint16_t entry);
#define CHAIN_END 0xffff /* end of the chunk chain */
@@ -52,18 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
-extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
-
-static void
-zap_memset(void *a, int c, size_t n)
-{
- char *cp = a;
- char *cpend = cp + n;
-
- while (cp < cpend)
- *cp++ = c;
-}
-
static void
stv(int len, void *addr, uint64_t value)
{
@@ -81,7 +70,7 @@ stv(int len, void *addr, uint64_t value)
*(uint64_t *)addr = value;
return;
default:
- cmn_err(CE_PANIC, "bad int len %d", len);
+ PANIC("bad int len %d", len);
}
}
@@ -98,13 +87,13 @@ ldv(int len, const void *addr)
case 8:
return (*(uint64_t *)addr);
default:
- cmn_err(CE_PANIC, "bad int len %d", len);
+ PANIC("bad int len %d", len);
}
return (0xFEEDFACEDEADBEEFULL);
}
void
-zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size)
{
zap_leaf_t l;
dmu_buf_t l_dbuf;
@@ -121,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
- for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
- for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
struct zap_leaf_entry *le;
@@ -162,11 +151,11 @@ void
zap_leaf_init(zap_leaf_t *l, boolean_t sort)
{
l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
- zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+ memset(&zap_leaf_phys(l)->l_hdr, 0,
sizeof (struct zap_leaf_header));
- zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
2*ZAP_LEAF_HASH_NUMENTRIES(l));
- for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
}
@@ -187,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l)
{
ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
- int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
@@ -209,7 +198,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
zlf->lf_type = ZAP_CHUNK_FREE;
zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
- bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */
zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
zap_leaf_phys(l)->l_hdr.lh_nfree++;
@@ -225,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf,
{
uint16_t chunk_head;
uint16_t *chunkp = &chunk_head;
- int byten = 0;
+ int byten = integer_size;
uint64_t value = 0;
int shift = (integer_size - 1) * 8;
int len = num_integers;
ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);
+ if (len > 0)
+ value = ldv(integer_size, buf);
while (len > 0) {
uint16_t chunk = zap_leaf_chunk_alloc(l);
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
la->la_type = ZAP_CHUNK_ARRAY;
for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
- if (byten == 0)
- value = ldv(integer_size, buf);
la->la_array[i] = value >> shift;
value <<= 8;
- if (++byten == integer_size) {
- byten = 0;
- buf += integer_size;
+ if (--byten == 0) {
if (--len == 0)
break;
+ byten = integer_size;
+ buf += integer_size;
+ value = ldv(integer_size, buf);
}
}
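The rewritten loop above still serializes each integer most-significant byte first, integer_size bytes per value; it only changes when `value` is refilled from the caller's buffer. A hedged, standalone sketch of that byte order follows (hypothetical names, and it ignores the splitting into ZAP_LEAF_ARRAY_BYTES-sized leaf chunks):

#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative only: emit each value big-endian, integer_size bytes per
 * value, packed back to back into a flat output buffer.
 */
static void
example_pack_big_endian(const uint64_t *vals, size_t nvals, int integer_size,
    uint8_t *out)
{
	int shift = (integer_size - 1) * 8;

	for (size_t n = 0; n < nvals; n++) {
		uint64_t v = vals[n];
		for (int b = 0; b < integer_size; b++) {
			*out++ = (uint8_t)(v >> shift);	/* next MSB */
			v <<= 8;
		}
	}
}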
@@ -266,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
*chunkp = CHAIN_END;
while (chunk != CHAIN_END) {
- int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
+ uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
ZAP_CHUNK_ARRAY);
zap_leaf_chunk_free(l, chunk);
@@ -306,7 +296,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
while (chunk != CHAIN_END) {
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
- bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+ memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES);
p += ZAP_LEAF_ARRAY_BYTES;
chunk = la->la_next;
}
@@ -317,7 +307,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
value = (value << 8) | la->la_array[i];
byten++;
if (byten == array_int_len) {
@@ -335,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
static boolean_t
zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
- int chunk, int array_numints)
+ uint_t chunk, int array_numints)
{
int bseen = 0;
@@ -346,7 +336,7 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
sizeof (*thiskey), array_numints, thiskey);
- boolean_t match = bcmp(thiskey, zn->zn_key_orig,
+ boolean_t match = memcmp(thiskey, zn->zn_key_orig,
array_numints * sizeof (*thiskey)) == 0;
kmem_free(thiskey, array_numints * sizeof (*thiskey));
return (match);
@@ -374,7 +364,8 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
+ if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen,
+ toread))
break;
chunk = la->la_next;
bseen += toread;
@@ -563,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
uint64_t valuelen = integer_size * num_integers;
- int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+ uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
return (SET_ERROR(E2BIG));
@@ -625,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
/* link it into the hash chain */
/* XXX if we did the search above, we could just use that */
- uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
+ uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk);
zap_leaf_phys(l)->l_hdr.lh_nentries++;
@@ -647,7 +638,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
* form of the name. But all callers have one of these on hand anyway,
* so might as well take advantage. A cleaner but slower interface
* would accept neither argument, and compute the normalized name as
- * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
+ * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))).
*/
boolean_t
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
@@ -668,7 +659,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
continue;
if (zn == NULL) {
- zn = zap_name_alloc(zap, name, MT_NORMALIZE);
+ zn = zap_name_alloc_str(zap, name, MT_NORMALIZE);
allocdzn = B_TRUE;
}
if (zap_leaf_array_match(zeh->zeh_leaf, zn,
@@ -688,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
*/
static uint16_t *
-zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry)
{
- struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
struct zap_leaf_entry *le2;
uint16_t *chunkp;
@@ -723,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
&ZAP_LEAF_CHUNK(nl, nchunk).l_array;
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
- int nextchunk = la->la_next;
+ uint_t nextchunk = la->la_next;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
@@ -740,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
}
static void
-zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl)
{
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
@@ -749,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
*nle = *le; /* structure assignment */
- (void) zap_leaf_rehash_entry(nl, chunk);
+ (void) zap_leaf_rehash_entry(nl, nle, chunk);
nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
nle->le_value_chunk =
@@ -767,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
void
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
{
- int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
/* set new prefix and prefix_len */
zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
@@ -778,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
/* break existing hash chains */
- zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
2*ZAP_LEAF_HASH_NUMENTRIES(l));
if (sort)
@@ -793,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
* but this accesses memory more sequentially, and when we're
* called, the block is usually pretty full.
*/
- for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
if (le->le_type != ZAP_CHUNK_ENTRY)
continue;
@@ -801,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
if (le->le_hash & (1ULL << bit))
zap_leaf_transfer_entry(l, i, nl);
else
- (void) zap_leaf_rehash_entry(l, i);
+ (void) zap_leaf_rehash_entry(l, le, i);
}
}
void
zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
{
- int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_leafs_with_2n_pointers[n]++;
@@ -824,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_blocks_n_tenths_full[n]++;
- for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
- int nentries = 0;
- int chunk = zap_leaf_phys(l)->l_hash[i];
+ for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ uint_t nentries = 0;
+ uint_t chunk = zap_leaf_phys(l)->l_hash[i];
while (chunk != CHAIN_END) {
struct zap_leaf_entry *le =
diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c
index b4611685b204..d806988af96d 100644
--- a/sys/contrib/openzfs/module/zfs/zap_micro.c
+++ b/sys/contrib/openzfs/module/zfs/zap_micro.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -33,7 +33,7 @@
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
-#include <sys/avl.h>
+#include <sys/btree.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>
@@ -41,10 +41,10 @@
#include <sys/sunddi.h>
#endif
-extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+int zap_micro_max_size = MZAP_MAX_BLKSZ;
static int mzap_upgrade(zap_t **zapp,
- void *tag, dmu_tx_t *tx, zap_flags_t flags);
+ const void *tag, dmu_tx_t *tx, zap_flags_t flags);
uint64_t
zap_getflags(zap_t *zap)
@@ -94,7 +94,7 @@ zap_hash(zap_name_t *zn)
wp++, i++) {
uint64_t word = *wp;
- for (int j = 0; j < zn->zn_key_intlen; j++) {
+ for (int j = 0; j < 8; j++) {
h = (h >> 8) ^
zfs_crc64_table[(h ^ word) & 0xFF];
word >>= NBBY;
@@ -164,18 +164,25 @@ zap_match(zap_name_t *zn, const char *matchname)
}
}
+static zap_name_t *
+zap_name_alloc(zap_t *zap)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+ zn->zn_zap = zap;
+ return (zn);
+}
+
void
zap_name_free(zap_name_t *zn)
{
kmem_free(zn, sizeof (zap_name_t));
}
-zap_name_t *
-zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
+static int
+zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
{
- zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+ zap_t *zap = zn->zn_zap;
- zn->zn_zap = zap;
zn->zn_key_intlen = sizeof (*key);
zn->zn_key_orig = key;
zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
@@ -196,17 +203,13 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
* what the hash is computed from.
*/
if (zap_normalize(zap, key, zn->zn_normbuf,
- zap->zap_normflags) != 0) {
- zap_name_free(zn);
- return (NULL);
- }
+ zap->zap_normflags) != 0)
+ return (SET_ERROR(ENOTSUP));
zn->zn_key_norm = zn->zn_normbuf;
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
} else {
- if (mt != 0) {
- zap_name_free(zn);
- return (NULL);
- }
+ if (mt != 0)
+ return (SET_ERROR(ENOTSUP));
zn->zn_key_norm = zn->zn_key_orig;
zn->zn_key_norm_numints = zn->zn_key_orig_numints;
}
@@ -219,13 +222,22 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
* what the matching is based on. (Not the hash!)
*/
if (zap_normalize(zap, key, zn->zn_normbuf,
- zn->zn_normflags) != 0) {
- zap_name_free(zn);
- return (NULL);
- }
+ zn->zn_normflags) != 0)
+ return (SET_ERROR(ENOTSUP));
zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
}
+ return (0);
+}
+
+zap_name_t *
+zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
+{
+ zap_name_t *zn = zap_name_alloc(zap);
+ if (zap_name_init_str(zn, key, mt) != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
return (zn);
}
@@ -273,51 +285,56 @@ zap_byteswap(void *buf, size_t size)
}
}
+__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
const mzap_ent_t *mze1 = arg1;
const mzap_ent_t *mze2 = arg2;
- int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash);
- if (likely(cmp))
- return (cmp);
-
- return (TREE_CMP(mze1->mze_cd, mze2->mze_cd));
+ return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
+ (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}
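The single TREE_CMP above works because packing the truncated 32-bit hash into the upper half of a 64-bit key and the 16-bit collision differentiator into the lower half preserves the old (hash, cd) lexicographic order. A hedged, standalone sketch of that packing (hypothetical helper name):

#include <stdint.h>

/*
 * Illustrative only: comparing packed keys is equivalent to comparing
 * (hash, cd) pairs lexicographically, since the hash occupies strictly
 * higher bits than the collision differentiator.
 */
static inline uint64_t
example_pack_mze_key(uint32_t hash32, uint16_t cd)
{
	return (((uint64_t)hash32 << 32) | cd);
}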
+ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
+ mze_compare)
+
static void
-mze_insert(zap_t *zap, int chunkid, uint64_t hash)
+mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
+ mzap_ent_t mze;
+
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
- mze->mze_chunkid = chunkid;
- mze->mze_hash = hash;
- mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
- ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
- avl_add(&zap->zap_m.zap_avl, mze);
+ mze.mze_chunkid = chunkid;
+ ASSERT0(hash & 0xffffffff);
+ mze.mze_hash = hash >> 32;
+ ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
+ mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
+ ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
+ zfs_btree_add(&zap->zap_m.zap_tree, &mze);
}
static mzap_ent_t *
-mze_find(zap_name_t *zn)
+mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
{
mzap_ent_t mze_tofind;
mzap_ent_t *mze;
- avl_index_t idx;
- avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
+ zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
ASSERT(zn->zn_zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
- mze_tofind.mze_hash = zn->zn_hash;
+ ASSERT0(zn->zn_hash & 0xffffffff);
+ mze_tofind.mze_hash = zn->zn_hash >> 32;
mze_tofind.mze_cd = 0;
- mze = avl_find(avl, &mze_tofind, &idx);
+ mze = zfs_btree_find(tree, &mze_tofind, idx);
if (mze == NULL)
- mze = avl_nearest(avl, idx, AVL_AFTER);
- for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
+ mze = zfs_btree_next(tree, idx, idx);
+ for (; mze && mze->mze_hash == mze_tofind.mze_hash;
+ mze = zfs_btree_next(tree, idx, idx)) {
ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
return (mze);
@@ -330,18 +347,21 @@ static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
mzap_ent_t mze_tofind;
- avl_index_t idx;
- avl_tree_t *avl = &zap->zap_m.zap_avl;
+ zfs_btree_index_t idx;
+ zfs_btree_t *tree = &zap->zap_m.zap_tree;
ASSERT(zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT0(hash & 0xffffffff);
+ hash >>= 32;
mze_tofind.mze_hash = hash;
mze_tofind.mze_cd = 0;
uint32_t cd = 0;
- for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
- mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash;
+ mze = zfs_btree_next(tree, &idx, &idx)) {
if (mze->mze_cd != cd)
break;
cd++;
@@ -366,16 +386,18 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{
zap_t *zap = zn->zn_zap;
mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
- avl_index_t idx;
- avl_tree_t *avl = &zap->zap_m.zap_avl;
+ zfs_btree_index_t idx;
+ zfs_btree_t *tree = &zap->zap_m.zap_tree;
uint32_t mzap_ents = 0;
+ ASSERT0(hash & 0xffffffff);
+ hash >>= 32;
mze_tofind.mze_hash = hash;
mze_tofind.mze_cd = 0;
- for (mze = avl_find(avl, &mze_tofind, &idx);
- mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash;
+ mze = zfs_btree_next(tree, &idx, &idx)) {
mzap_ents++;
}
@@ -386,28 +408,14 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
}
static void
-mze_remove(zap_t *zap, mzap_ent_t *mze)
-{
- ASSERT(zap->zap_ismicro);
- ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-
- avl_remove(&zap->zap_m.zap_avl, mze);
- kmem_free(mze, sizeof (mzap_ent_t));
-}
-
-static void
mze_destroy(zap_t *zap)
{
- mzap_ent_t *mze;
- void *avlcookie = NULL;
-
- while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
- kmem_free(mze, sizeof (mzap_ent_t));
- avl_destroy(&zap->zap_m.zap_avl);
+ zfs_btree_clear(&zap->zap_m.zap_tree);
+ zfs_btree_destroy(&zap->zap_m.zap_tree);
}
static zap_t *
-mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+mzap_open(dmu_buf_t *db)
{
zap_t *winner;
uint64_t *zap_hdr = (uint64_t *)db->db_data;
@@ -419,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
rw_enter(&zap->zap_rwlock, RW_WRITER);
- zap->zap_objset = os;
- zap->zap_object = obj;
+ zap->zap_objset = dmu_buf_get_objset(db);
+ zap->zap_object = db->db_object;
zap->zap_dbuf = db;
if (zap_block_type != ZBT_MICRO) {
@@ -450,21 +458,26 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
zap->zap_salt = zap_m_phys(zap)->mz_salt;
zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
- avl_create(&zap->zap_m.zap_avl, mze_compare,
- sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
- for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ /*
+ * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
+ * overhead on massive inserts below. It still allows storing
+ * 62 entries before we have to add a 2KB B-tree core node.
+ */
+ zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
+ mze_find_in_buf, sizeof (mzap_ent_t), 512);
+
+ zap_name_t *zn = zap_name_alloc(zap);
+ for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze =
&zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0]) {
- zap_name_t *zn;
-
zap->zap_m.zap_num_entries++;
- zn = zap_name_alloc(zap, mze->mze_name, 0);
+ zap_name_init_str(zn, mze->mze_name, 0);
mze_insert(zap, i, zn->zn_hash);
- zap_name_free(zn);
}
}
+ zap_name_free(zn);
} else {
zap->zap_salt = zap_f_phys(zap)->zap_salt;
zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
@@ -505,7 +518,7 @@ handle_winner:
* have the specified tag.
*/
static int
-zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
+zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
ASSERT0(db->db_offset);
@@ -515,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
*zapp = NULL;
- dmu_object_info_from_db(db, &doi);
+ dmu_object_info_from_dnode(dn, &doi);
if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
return (SET_ERROR(EINVAL));
zap_t *zap = dmu_buf_get_user(db);
if (zap == NULL) {
- zap = mzap_open(os, obj, db);
+ zap = mzap_open(db);
if (zap == NULL) {
/*
* mzap_open() didn't like what it saw on-disk.
@@ -550,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
}
zap->zap_objset = os;
+ zap->zap_dnode = dn;
if (lt == RW_WRITER)
dmu_buf_will_dirty(db, tx);
@@ -561,7 +575,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
if (zap->zap_ismicro && tx && adding &&
zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
- if (newsz > MZAP_MAX_BLKSZ) {
+ if (newsz > zap_micro_max_size) {
dprintf("upgrading obj %llu: num_entries=%u\n",
(u_longlong_t)obj, zap->zap_m.zap_num_entries);
*zapp = zap;
@@ -581,60 +595,58 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
- krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+ krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
+ zap_t **zapp)
{
dmu_buf_t *db;
+ int err;
- int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
- if (err != 0) {
+ err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
return (err);
- }
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
- }
-#endif
-
- err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
- if (err != 0) {
+ err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0)
dmu_buf_rele(db, tag);
- }
+ else
+ VERIFY(dnode_add_ref(dn, tag));
return (err);
}
int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
- krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+ krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
+ zap_t **zapp)
{
+ dnode_t *dn;
dmu_buf_t *db;
+ int err;
- int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ err = dnode_hold(os, obj, tag, &dn);
if (err != 0)
return (err);
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ dnode_rele(dn, tag);
+ return (err);
}
-#endif
- err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
- if (err != 0)
+ err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0) {
dmu_buf_rele(db, tag);
+ dnode_rele(dn, tag);
+ }
return (err);
}
void
-zap_unlockdir(zap_t *zap, void *tag)
+zap_unlockdir(zap_t *zap, const void *tag)
{
rw_exit(&zap->zap_rwlock);
+ dnode_rele(zap->zap_dnode, tag);
dmu_buf_rele(zap->zap_dbuf, tag);
}
static int
-mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
+mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
int err = 0;
zap_t *zap = *zapp;
@@ -643,7 +655,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
int sz = zap->zap_dbuf->db_size;
mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
- bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ memcpy(mzp, zap->zap_dbuf->db_data, sz);
int nchunks = zap->zap_m.zap_num_chunks;
if (!flags) {
@@ -657,24 +669,25 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
dprintf("upgrading obj=%llu with %u chunks\n",
(u_longlong_t)zap->zap_object, nchunks);
- /* XXX destroy the avl later, so we can use the stored hash value */
+ /* XXX destroy the tree later, so we can use the stored hash value */
mze_destroy(zap);
fzap_upgrade(zap, tx, flags);
+ zap_name_t *zn = zap_name_alloc(zap);
for (int i = 0; i < nchunks; i++) {
mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
if (mze->mze_name[0] == 0)
continue;
dprintf("adding %s=%llu\n",
mze->mze_name, (u_longlong_t)mze->mze_value);
- zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
+ zap_name_init_str(zn, mze->mze_name, 0);
/* If we fail here, we would end up losing entries */
VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
tag, tx));
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
- zap_name_free(zn);
}
+ zap_name_free(zn);
vmem_free(mzp, sz);
*zapp = zap;
return (0);
@@ -714,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
if (flags != 0) {
zap_t *zap;
/* Only fat zap supports flags; upgrade immediately. */
- VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
+ VERIFY(dnode_add_ref(dn, FTAG));
+ VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
B_FALSE, B_FALSE, &zap));
VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
zap_unlockdir(zap, FTAG);
@@ -727,7 +741,7 @@ static uint64_t
zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, int dnodesize,
- dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+ dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
uint64_t obj;
@@ -859,7 +873,7 @@ uint64_t
zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, int dnodesize,
- dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+ dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
{
return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
indirect_blockshift, bonustype, bonuslen, dnodesize,
@@ -916,22 +930,23 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
* See also the comment above zap_entry_normalization_conflict().
*/
static boolean_t
-mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
+mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
+ zfs_btree_index_t *idx)
{
- int direction = AVL_BEFORE;
boolean_t allocdzn = B_FALSE;
+ mzap_ent_t *other;
+ zfs_btree_index_t oidx;
if (zap->zap_normflags == 0)
return (B_FALSE);
-again:
- for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
+ for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
other && other->mze_hash == mze->mze_hash;
- other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
+ other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
if (zn == NULL) {
- zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
- MT_NORMALIZE);
+ zn = zap_name_alloc_str(zap,
+ MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
allocdzn = B_TRUE;
}
if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
@@ -941,9 +956,20 @@ again:
}
}
- if (direction == AVL_BEFORE) {
- direction = AVL_AFTER;
- goto again;
+ for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
+ other && other->mze_hash == mze->mze_hash;
+ other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
+
+ if (zn == NULL) {
+ zn = zap_name_alloc_str(zap,
+ MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
+ allocdzn = B_TRUE;
+ }
+ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_TRUE);
+ }
}
if (allocdzn)
@@ -971,7 +997,7 @@ zap_lookup_impl(zap_t *zap, const char *name,
{
int err = 0;
- zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
@@ -979,7 +1005,8 @@ zap_lookup_impl(zap_t *zap, const char *name,
err = fzap_lookup(zn, integer_size, num_integers, buf,
realname, rn_len, ncp);
} else {
- mzap_ent_t *mze = mze_find(zn);
+ zfs_btree_index_t idx;
+ mzap_ent_t *mze = mze_find(zn, &idx);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
@@ -990,11 +1017,13 @@ zap_lookup_impl(zap_t *zap, const char *name,
} else {
*(uint64_t *)buf =
MZE_PHYS(zap, mze)->mze_value;
- (void) strlcpy(realname,
- MZE_PHYS(zap, mze)->mze_name, rn_len);
+ if (realname != NULL)
+ (void) strlcpy(realname,
+ MZE_PHYS(zap, mze)->mze_name,
+ rn_len);
if (ncp) {
*ncp = mzap_normalization_conflict(zap,
- zn, mze);
+ zn, mze, &idx);
}
}
}
@@ -1031,7 +1060,7 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
- zn = zap_name_alloc(zap, name, 0);
+ zn = zap_name_alloc_str(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1134,7 +1163,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
- zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1142,7 +1171,8 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
if (!zap->zap_ismicro) {
err = fzap_length(zn, integer_size, num_integers);
} else {
- mzap_ent_t *mze = mze_find(zn);
+ zfs_btree_index_t idx;
+ mzap_ent_t *mze = mze_find(zn, &idx);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
@@ -1182,7 +1212,7 @@ static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
zap_t *zap = zn->zn_zap;
- int start = zap->zap_m.zap_alloc_next;
+ uint16_t start = zap->zap_m.zap_alloc_next;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -1198,7 +1228,7 @@ mzap_addent(zap_name_t *zn, uint64_t value)
ASSERT(cd < zap_maxcd(zap));
again:
- for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0] == 0) {
mze->mze_value = value;
@@ -1224,12 +1254,12 @@ again:
static int
zap_add_impl(zap_t *zap, const char *key,
int integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx, void *tag)
+ const void *val, dmu_tx_t *tx, const void *tag)
{
const uint64_t *intval = val;
int err = 0;
- zap_name_t *zn = zap_name_alloc(zap, key, 0);
+ zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
if (zn == NULL) {
zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP));
@@ -1247,7 +1277,8 @@ zap_add_impl(zap_t *zap, const char *key,
}
zap = zn->zn_zap; /* fzap_add() may change zap */
} else {
- if (mze_find(zn) != NULL) {
+ zfs_btree_index_t idx;
+ if (mze_find(zn, &idx) != NULL) {
err = SET_ERROR(EEXIST);
} else {
mzap_addent(zn, *intval);
@@ -1292,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key,
return (err);
}
+static int
+zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx, const void *tag)
+{
+ int err;
+
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap, tag);
+ return (err);
+}
+
int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
@@ -1303,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
- zap = zn->zn_zap; /* fzap_add() may change zap */
- zap_name_free(zn);
- if (zap != NULL) /* may be NULL if fzap_add() failed */
- zap_unlockdir(zap, FTAG);
+ err = zap_add_uint64_impl(zap, key, key_numints,
+ integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_uint64_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_add_uint64_impl(zap, key, key_numints,
+ integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_uint64_impl() calls zap_unlockdir() */
return (err);
}
@@ -1327,7 +1388,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
- zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1348,7 +1409,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
}
zap = zn->zn_zap; /* fzap_update() may change zap */
} else {
- mzap_ent_t *mze = mze_find(zn);
+ zfs_btree_index_t idx;
+ mzap_ent_t *mze = mze_find(zn, &idx);
if (mze != NULL) {
MZE_PHYS(zap, mze)->mze_value = *intval;
} else {
@@ -1362,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
return (err);
}
-int
-zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
- int key_numints,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+static int
+zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
+ const void *tag)
{
- zap_t *zap;
+ int err;
- int err =
- zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err != 0)
- return (err);
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
+ zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP));
}
- err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
+ err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
- zap_unlockdir(zap, FTAG);
+ zap_unlockdir(zap, tag);
+ return (err);
+}
+
+int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers, const void *val,
+ dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_update_uint64_impl(zap, key, key_numints,
+ integer_size, num_integers, val, tx, FTAG);
+ /* zap_update_uint64_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_update_uint64_impl(zap, key, key_numints,
+ integer_size, num_integers, val, tx, FTAG);
+ /* zap_update_uint64_impl() calls zap_unlockdir() */
return (err);
}
@@ -1398,20 +1489,20 @@ zap_remove_impl(zap_t *zap, const char *name,
{
int err = 0;
- zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
if (!zap->zap_ismicro) {
err = fzap_remove(zn, tx);
} else {
- mzap_ent_t *mze = mze_find(zn);
+ zfs_btree_index_t idx;
+ mzap_ent_t *mze = mze_find(zn, &idx);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
zap->zap_m.zap_num_entries--;
- bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
- sizeof (mzap_ent_phys_t));
- mze_remove(zap, mze);
+ memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
+ zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
}
}
zap_name_free(zn);
@@ -1447,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
return (err);
}
+static int
+zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
+ dmu_tx_t *tx, const void *tag)
+{
+ int err;
+
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_remove(zn, tx);
+ zap_name_free(zn);
+ zap_unlockdir(zap, tag);
+ return (err);
+}
+
int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx)
@@ -1457,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
- zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
- if (zn == NULL) {
- zap_unlockdir(zap, FTAG);
- return (SET_ERROR(ENOTSUP));
- }
- err = fzap_remove(zn, tx);
- zap_name_free(zn);
- zap_unlockdir(zap, FTAG);
+ err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
+ /* zap_remove_uint64_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
+ dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
+ /* zap_remove_uint64_impl() calls zap_unlockdir() */
return (err);
}
@@ -1582,29 +1699,30 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
if (!zc->zc_zap->zap_ismicro) {
err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
} else {
- avl_index_t idx;
+ zfs_btree_index_t idx;
mzap_ent_t mze_tofind;
- mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_hash = zc->zc_hash >> 32;
mze_tofind.mze_cd = zc->zc_cd;
- mzap_ent_t *mze =
- avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
+ mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
+ &mze_tofind, &idx);
if (mze == NULL) {
- mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
- idx, AVL_AFTER);
+ mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
+ &idx, &idx);
}
if (mze) {
mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
za->za_normalization_conflict =
- mzap_normalization_conflict(zc->zc_zap, NULL, mze);
+ mzap_normalization_conflict(zc->zc_zap, NULL,
+ mze, &idx);
za->za_integer_length = 8;
za->za_num_integers = 1;
za->za_first_integer = mzep->mze_value;
(void) strlcpy(za->za_name, mzep->mze_name,
sizeof (za->za_name));
- zc->zc_hash = mze->mze_hash;
+ zc->zc_hash = (uint64_t)mze->mze_hash << 32;
zc->zc_cd = mze->mze_cd;
err = 0;
} else {
@@ -1634,7 +1752,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
if (err != 0)
return (err);
- bzero(zs, sizeof (zap_stats_t));
+ memset(zs, 0, sizeof (zap_stats_t));
if (zap->zap_ismicro) {
zs->zs_blocksize = zap->zap_dbuf->db_size;
@@ -1669,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
+EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
+EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
+EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
@@ -1695,4 +1816,8 @@ EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
+ "Maximum micro ZAP size, before converting to a fat ZAP, in bytes");
#endif
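Illustrative aside (not part of this diff): the new zap_update_uint64_by_dnode() and zap_remove_uint64_by_dnode() entry points take an already-held dnode instead of an (objset, object) pair, so a caller that has the dnode in hand avoids a second object lookup. A minimal kernel-context sketch of the calling pattern follows; the helper name and its setup are assumptions for illustration only.

#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zap.h>

/* Hypothetical helper: maintain a uint64-keyed entry via a held dnode. */
static int
example_touch_entry(dnode_t *dn, uint64_t key, boolean_t remove, dmu_tx_t *tx)
{
	if (remove) {
		/* Drop the entry through the held dnode. */
		return (zap_remove_uint64_by_dnode(dn, &key, 1, tx));
	}

	uint64_t value = 1;
	/* One-word key, single 8-byte integer value. */
	return (zap_update_uint64_by_dnode(dn, &key, 1,
	    sizeof (uint64_t), 1, &value, tx));
}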
diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c
index f724b44baf1d..7c279162a9d1 100644
--- a/sys/contrib/openzfs/module/zfs/zcp.c
+++ b/sys/contrib/openzfs/module/zfs/zcp.c
@@ -108,9 +108,9 @@
#define ZCP_NVLIST_MAX_DEPTH 20
-uint64_t zfs_lua_check_instrlimit_interval = 100;
-unsigned long zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
-unsigned long zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
+static const uint64_t zfs_lua_check_instrlimit_interval = 100;
+uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
+uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
/*
* Forward declarations for mutually recursive functions
@@ -277,9 +277,9 @@ zcp_table_to_nvlist(lua_State *state, int index, int depth)
}
break;
case LUA_TNUMBER:
- VERIFY3U(sizeof (buf), >,
- snprintf(buf, sizeof (buf), "%lld",
- (longlong_t)lua_tonumber(state, -2)));
+ (void) snprintf(buf, sizeof (buf), "%lld",
+ (longlong_t)lua_tonumber(state, -2));
+
key = buf;
if (saw_str_could_collide) {
key_could_collide = B_TRUE;
@@ -544,7 +544,7 @@ zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair,
fnvpair_value_nvlist(pair), errbuf, errbuf_len);
break;
case DATA_TYPE_STRING_ARRAY: {
- char **strarr;
+ const char **strarr;
uint_t nelem;
(void) nvpair_value_string_array(pair, &strarr, &nelem);
lua_newtable(state);
@@ -622,7 +622,7 @@ zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname,
*/
dsl_dataset_t *
zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname,
- void *tag)
+ const void *tag)
{
dsl_dataset_t *ds;
int error = dsl_dataset_hold(dp, dsname, tag, &ds);
@@ -631,11 +631,11 @@ zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname,
}
static int zcp_debug(lua_State *);
-static zcp_lib_info_t zcp_debug_info = {
+static const zcp_lib_info_t zcp_debug_info = {
.name = "debug",
.func = zcp_debug,
.pargs = {
- { .za_name = "debug string", .za_lua_type = LUA_TSTRING},
+ { .za_name = "debug string", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -648,7 +648,7 @@ zcp_debug(lua_State *state)
{
const char *dbgstring;
zcp_run_info_t *ri = zcp_run_info(state);
- zcp_lib_info_t *libinfo = &zcp_debug_info;
+ const zcp_lib_info_t *libinfo = &zcp_debug_info;
zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
@@ -661,11 +661,11 @@ zcp_debug(lua_State *state)
}
static int zcp_exists(lua_State *);
-static zcp_lib_info_t zcp_exists_info = {
+static const zcp_lib_info_t zcp_exists_info = {
.name = "exists",
.func = zcp_exists,
.pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -678,7 +678,7 @@ zcp_exists(lua_State *state)
{
zcp_run_info_t *ri = zcp_run_info(state);
dsl_pool_t *dp = ri->zri_pool;
- zcp_lib_info_t *libinfo = &zcp_exists_info;
+ const zcp_lib_info_t *libinfo = &zcp_exists_info;
zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
@@ -769,10 +769,10 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
}
}
-/* ARGSUSED */
static void
zcp_lua_counthook(lua_State *state, lua_Debug *ar)
{
+ (void) ar;
lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
zcp_run_info_t *ri = lua_touserdata(state, -1);
@@ -780,8 +780,7 @@ zcp_lua_counthook(lua_State *state, lua_Debug *ar)
* Check if we were canceled while waiting for the
* txg to sync or from our open context thread
*/
- if (ri->zri_canceled ||
- (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) {
+ if (ri->zri_canceled || (!ri->zri_sync && issig())) {
ri->zri_canceled = B_TRUE;
(void) lua_pushstring(state, "Channel program was canceled.");
(void) lua_error(state);
@@ -958,12 +957,12 @@ zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri)
}
static void
-zcp_pool_error(zcp_run_info_t *ri, const char *poolname)
+zcp_pool_error(zcp_run_info_t *ri, const char *poolname, int error)
{
ri->zri_result = SET_ERROR(ECHRNG);
lua_settop(ri->zri_state, 0);
- (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s",
- poolname);
+ (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s "
+ "errno: %d", poolname, error);
zcp_convert_return_values(ri->zri_state, ri->zri_outnvl,
ZCP_RET_ERROR, &ri->zri_result);
@@ -974,10 +973,10 @@ zcp_pool_error(zcp_run_info_t *ri, const char *poolname)
* The txg_wait_synced_sig will continue to wait for the txg to complete
* after calling this callback.
*/
-/* ARGSUSED */
static void
zcp_eval_sig(void *arg, dmu_tx_t *tx)
{
+ (void) tx;
zcp_run_info_t *ri = arg;
ri->zri_canceled = B_TRUE;
@@ -1013,7 +1012,7 @@ zcp_eval_open(zcp_run_info_t *ri, const char *poolname)
error = dsl_pool_hold(poolname, FTAG, &dp);
if (error != 0) {
- zcp_pool_error(ri, poolname);
+ zcp_pool_error(ri, poolname, error);
return;
}
@@ -1159,7 +1158,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync,
zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
if (err != 0)
- zcp_pool_error(&runinfo, poolname);
+ zcp_pool_error(&runinfo, poolname, err);
} else {
zcp_eval_open(&runinfo, poolname);
}
@@ -1443,10 +1442,8 @@ zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
}
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, U64, ZMOD_RW,
"Max instruction limit that can be specified for a channel program");
-ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, U64, ZMOD_RW,
"Max memory limit that can be specified for a channel program");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zcp_get.c b/sys/contrib/openzfs/module/zfs/zcp_get.c
index 7256e4de1915..6fd45151d92a 100644
--- a/sys/contrib/openzfs/module/zfs/zcp_get.c
+++ b/sys/contrib/openzfs/module/zfs/zcp_get.c
@@ -76,9 +76,8 @@ get_objset_type(dsl_dataset_t *ds, zfs_type_t *type)
static int
get_objset_type_name(dsl_dataset_t *ds, char *str)
{
- int error;
- zfs_type_t type;
- error = get_objset_type(ds, &type);
+ zfs_type_t type = ZFS_TYPE_INVALID;
+ int error = get_objset_type(ds, &type);
if (error != 0)
return (error);
switch (type) {
@@ -230,7 +229,7 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
char setpoint[ZFS_MAX_DATASET_NAME_LEN] =
"Internal error - setpoint not determined";
- zfs_type_t ds_type;
+ zfs_type_t ds_type = ZFS_TYPE_INVALID;
zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
(void) get_objset_type(ds, &ds_type);
@@ -344,19 +343,13 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
}
break;
case ZFS_PROP_RECEIVE_RESUME_TOKEN: {
- char *token = get_receive_resume_stats_impl(ds);
-
- (void) strlcpy(strval, token, ZAP_MAXVALUELEN);
- if (strcmp(strval, "") == 0) {
- char *childval = get_child_receive_stats(ds);
-
- (void) strlcpy(strval, childval, ZAP_MAXVALUELEN);
- if (strcmp(strval, "") == 0)
- error = ENOENT;
-
- kmem_strfree(childval);
+ char *token = get_receive_resume_token(ds);
+ if (token != NULL) {
+ (void) strlcpy(strval, token, ZAP_MAXVALUELEN);
+ kmem_strfree(token);
+ } else {
+ error = ENOENT;
}
- kmem_strfree(token);
break;
}
case ZFS_PROP_VOLSIZE:
@@ -398,7 +391,7 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
dsl_dataset_crypt_stats(ds, nvl);
if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop),
&propval) == 0) {
- char *source;
+ const char *source;
(void) nvlist_lookup_uint64(propval, ZPROP_VALUE,
&numval);
@@ -410,6 +403,10 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
break;
}
+ case ZFS_PROP_SNAPSHOTS_CHANGED:
+ numval = dsl_dir_snap_cmtime(ds->ds_dir).tv_sec;
+ break;
+
default:
/* Did not match these props, check in the dsl_dir */
error = get_dsl_dir_prop(ds, zfs_prop, &numval);
@@ -470,11 +467,13 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
} else {
error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
1, &numval, setpoint);
-
+ if (error != 0)
+ goto out;
#ifdef _KERNEL
/* Fill in temporary value for prop, if applicable */
(void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint);
#else
+ kmem_free(strval, ZAP_MAXVALUELEN);
return (luaL_error(state,
"temporary properties only supported in kernel mode",
prop_name));
@@ -491,6 +490,7 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
(void) lua_pushnumber(state, numval);
}
}
+out:
kmem_free(strval, ZAP_MAXVALUELEN);
if (error == 0)
get_prop_src(state, setpoint, zfs_prop);
@@ -503,8 +503,7 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
boolean_t
prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
{
- int error;
- zfs_type_t zfs_type;
+ zfs_type_t zfs_type = ZFS_TYPE_INVALID;
/* properties not supported */
if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
@@ -515,7 +514,7 @@ prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
return (B_FALSE);
- error = get_objset_type(ds, &zfs_type);
+ int error = get_objset_type(ds, &zfs_type);
if (error != 0)
return (B_FALSE);
return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE));
@@ -611,8 +610,7 @@ parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
*/
int domain_len = strrchr(cp, '-') - cp;
domain_val = kmem_alloc(domain_len + 1, KM_SLEEP);
- (void) strncpy(domain_val, cp, domain_len);
- domain_val[domain_len] = '\0';
+ (void) strlcpy(domain_val, cp, domain_len + 1);
cp += domain_len + 1;
(void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
@@ -743,12 +741,12 @@ zcp_get_written_prop(lua_State *state, dsl_pool_t *dp,
}
static int zcp_get_prop(lua_State *state);
-static zcp_lib_info_t zcp_get_prop_info = {
+static const zcp_lib_info_t zcp_get_prop_info = {
.name = "get_prop",
.func = zcp_get_prop,
.pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
- { .za_name = "property", .za_lua_type = LUA_TSTRING},
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
+ { .za_name = "property", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -762,7 +760,7 @@ zcp_get_prop(lua_State *state)
const char *dataset_name;
const char *property_name;
dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- zcp_lib_info_t *libinfo = &zcp_get_prop_info;
+ const zcp_lib_info_t *libinfo = &zcp_get_prop_info;
zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
diff --git a/sys/contrib/openzfs/module/zfs/zcp_iter.c b/sys/contrib/openzfs/module/zfs/zcp_iter.c
index f727c56f212d..2da0bf9740e5 100644
--- a/sys/contrib/openzfs/module/zfs/zcp_iter.c
+++ b/sys/contrib/openzfs/module/zfs/zcp_iter.c
@@ -107,12 +107,12 @@ zcp_clones_iter(lua_State *state)
}
static int zcp_clones_list(lua_State *);
-static zcp_list_info_t zcp_clones_list_info = {
+static const zcp_list_info_t zcp_clones_list_info = {
.name = "clones",
.func = zcp_clones_list,
.gc = NULL,
.pargs = {
- { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+ { .za_name = "snapshot", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -194,12 +194,12 @@ zcp_snapshots_iter(lua_State *state)
}
static int zcp_snapshots_list(lua_State *);
-static zcp_list_info_t zcp_snapshots_list_info = {
+static const zcp_list_info_t zcp_snapshots_list_info = {
.name = "snapshots",
.func = zcp_snapshots_list,
.gc = NULL,
.pargs = {
- { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -281,12 +281,12 @@ zcp_children_iter(lua_State *state)
}
static int zcp_children_list(lua_State *);
-static zcp_list_info_t zcp_children_list_info = {
+static const zcp_list_info_t zcp_children_list_info = {
.name = "children",
.func = zcp_children_list,
.gc = NULL,
.pargs = {
- { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -333,7 +333,7 @@ zcp_user_props_list_gc(lua_State *state)
static int
zcp_user_props_iter(lua_State *state)
{
- char *source, *val;
+ const char *source, *val;
nvlist_t *nvprop;
nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1));
nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2));
@@ -361,13 +361,13 @@ zcp_user_props_iter(lua_State *state)
}
static int zcp_user_props_list(lua_State *);
-static zcp_list_info_t zcp_user_props_list_info = {
+static const zcp_list_info_t zcp_user_props_list_info = {
.name = "user_properties",
.func = zcp_user_props_list,
.gc = zcp_user_props_list_gc,
.pargs = {
{ .za_name = "filesystem | snapshot | volume",
- .za_lua_type = LUA_TSTRING},
+ .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -383,13 +383,13 @@ static zcp_list_info_t zcp_user_props_list_info = {
* versions of ZFS, we declare 'properties' as an alias for
* 'user_properties'.
*/
-static zcp_list_info_t zcp_props_list_info = {
+static const zcp_list_info_t zcp_props_list_info = {
.name = "properties",
.func = zcp_user_props_list,
.gc = zcp_user_props_list_gc,
.pargs = {
{ .za_name = "filesystem | snapshot | volume",
- .za_lua_type = LUA_TSTRING},
+ .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -444,11 +444,11 @@ zcp_dataset_system_props(dsl_dataset_t *ds, nvlist_t *nv)
}
static int zcp_system_props_list(lua_State *);
-static zcp_list_info_t zcp_system_props_list_info = {
+static const zcp_list_info_t zcp_system_props_list_info = {
.name = "system_properties",
.func = zcp_system_props_list,
.pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -467,7 +467,7 @@ zcp_system_props_list(lua_State *state)
char errbuf[128];
const char *dataset_name;
dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
- zcp_list_info_t *libinfo = &zcp_system_props_list_info;
+ const zcp_list_info_t *libinfo = &zcp_system_props_list_info;
zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
dataset_name = lua_tostring(state, 1);
nvlist_t *nv = fnvlist_alloc();
@@ -566,11 +566,11 @@ zcp_bookmarks_iter(lua_State *state)
}
static int zcp_bookmarks_list(lua_State *);
-static zcp_list_info_t zcp_bookmarks_list_info = {
+static const zcp_list_info_t zcp_bookmarks_list_info = {
.name = "bookmarks",
.func = zcp_bookmarks_list,
.pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -654,12 +654,12 @@ zcp_holds_iter(lua_State *state)
}
static int zcp_holds_list(lua_State *);
-static zcp_list_info_t zcp_holds_list_info = {
+static const zcp_list_info_t zcp_holds_list_info = {
.name = "holds",
.func = zcp_holds_list,
.gc = NULL,
.pargs = {
- { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+ { .za_name = "snapshot", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -710,8 +710,7 @@ zcp_list_func(lua_State *state)
int
zcp_load_list_lib(lua_State *state)
{
- int i;
- zcp_list_info_t *zcp_list_funcs[] = {
+ const zcp_list_info_t *zcp_list_funcs[] = {
&zcp_children_list_info,
&zcp_snapshots_list_info,
&zcp_user_props_list_info,
@@ -725,8 +724,8 @@ zcp_load_list_lib(lua_State *state)
lua_newtable(state);
- for (i = 0; zcp_list_funcs[i] != NULL; i++) {
- zcp_list_info_t *info = zcp_list_funcs[i];
+ for (int i = 0; zcp_list_funcs[i] != NULL; i++) {
+ const zcp_list_info_t *info = zcp_list_funcs[i];
if (info->gc != NULL) {
/*
@@ -741,10 +740,9 @@ zcp_load_list_lib(lua_State *state)
lua_pop(state, 1);
}
- lua_pushlightuserdata(state, info);
+ lua_pushlightuserdata(state, (void *)(uintptr_t)info);
lua_pushcclosure(state, &zcp_list_func, 1);
lua_setfield(state, -2, info->name);
- info++;
}
return (1);
diff --git a/sys/contrib/openzfs/module/zfs/zcp_synctask.c b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
index c6ade59b9ced..058910054d97 100644
--- a/sys/contrib/openzfs/module/zfs/zcp_synctask.c
+++ b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
@@ -114,25 +114,25 @@ zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc,
static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *);
-static zcp_synctask_info_t zcp_synctask_destroy_info = {
+static const zcp_synctask_info_t zcp_synctask_destroy_info = {
.name = "destroy",
.func = zcp_synctask_destroy,
.pargs = {
- {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING},
+ {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
- {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
+ {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN },
{NULL, 0}
},
.space_check = ZFS_SPACE_CHECK_DESTROY,
.blocks_modified = 0
};
-/* ARGSUSED */
static int
zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details)
{
+ (void) err_details;
int err;
const char *dsname = lua_tostring(state, 1);
@@ -167,11 +167,11 @@ zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details)
}
static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *);
-static zcp_synctask_info_t zcp_synctask_promote_info = {
+static const zcp_synctask_info_t zcp_synctask_promote_info = {
.name = "promote",
.func = zcp_synctask_promote,
.pargs = {
- {.za_name = "clone", .za_lua_type = LUA_TSTRING},
+ {.za_name = "clone", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -205,13 +205,13 @@ zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details)
}
static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details);
-static zcp_synctask_info_t zcp_synctask_rollback_info = {
+static const zcp_synctask_info_t zcp_synctask_rollback_info = {
.name = "rollback",
.func = zcp_synctask_rollback,
.space_check = ZFS_SPACE_CHECK_RESERVED,
.blocks_modified = 1,
.pargs = {
- {.za_name = "filesystem", .za_lua_type = LUA_TSTRING},
+ {.za_name = "filesystem", .za_lua_type = LUA_TSTRING },
{0, 0}
},
.kwargs = {
@@ -236,12 +236,12 @@ zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details)
}
static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *);
-static zcp_synctask_info_t zcp_synctask_snapshot_info = {
+static const zcp_synctask_info_t zcp_synctask_snapshot_info = {
.name = "snapshot",
.func = zcp_synctask_snapshot,
.pargs = {
{.za_name = "filesystem@snapname | volume@snapname",
- .za_lua_type = LUA_TSTRING},
+ .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -251,10 +251,10 @@ static zcp_synctask_info_t zcp_synctask_snapshot_info = {
.blocks_modified = 3
};
-/* ARGSUSED */
static int
zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
{
+ (void) err_details;
int err;
dsl_dataset_snapshot_arg_t ddsa = { 0 };
const char *dsname = lua_tostring(state, 1);
@@ -302,9 +302,45 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
return (err);
}
+static int zcp_synctask_rename_snapshot(lua_State *, boolean_t, nvlist_t *);
+static const zcp_synctask_info_t zcp_synctask_rename_snapshot_info = {
+ .name = "rename_snapshot",
+ .func = zcp_synctask_rename_snapshot,
+ .pargs = {
+ {.za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING },
+ {.za_name = "oldsnapname", .za_lua_type = LUA_TSTRING },
+ {.za_name = "newsnapname", .za_lua_type = LUA_TSTRING },
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 1
+};
+
+static int
+zcp_synctask_rename_snapshot(lua_State *state, boolean_t sync,
+ nvlist_t *err_details)
+{
+ (void) err_details;
+ int err;
+ const char *fsname = lua_tostring(state, 1);
+ const char *oldsnapname = lua_tostring(state, 2);
+ const char *newsnapname = lua_tostring(state, 3);
+
+ struct dsl_dataset_rename_snapshot_arg ddrsa = { 0 };
+ ddrsa.ddrsa_fsname = fsname;
+ ddrsa.ddrsa_oldsnapname = oldsnapname;
+ ddrsa.ddrsa_newsnapname = newsnapname;
+ ddrsa.ddrsa_recursive = B_FALSE;
+
+ err = zcp_sync_task(state, dsl_dataset_rename_snapshot_check,
+ dsl_dataset_rename_snapshot_sync, &ddrsa, sync, NULL);
+
+ return (err);
+}
+
static int zcp_synctask_inherit_prop(lua_State *, boolean_t,
nvlist_t *err_details);
-static zcp_synctask_info_t zcp_synctask_inherit_prop_info = {
+static const zcp_synctask_info_t zcp_synctask_inherit_prop_info = {
.name = "inherit",
.func = zcp_synctask_inherit_prop,
.space_check = ZFS_SPACE_CHECK_RESERVED,
@@ -325,7 +361,7 @@ zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx)
zcp_inherit_prop_arg_t *args = arg;
zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop);
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (zfs_prop_user(args->zipa_prop))
return (0);
@@ -354,6 +390,7 @@ static int
zcp_synctask_inherit_prop(lua_State *state, boolean_t sync,
nvlist_t *err_details)
{
+ (void) err_details;
int err;
zcp_inherit_prop_arg_t zipa = { 0 };
dsl_props_set_arg_t *dpsa = &zipa.zipa_dpsa;
@@ -381,12 +418,12 @@ zcp_synctask_inherit_prop(lua_State *state, boolean_t sync,
}
static int zcp_synctask_bookmark(lua_State *, boolean_t, nvlist_t *);
-static zcp_synctask_info_t zcp_synctask_bookmark_info = {
+static const zcp_synctask_info_t zcp_synctask_bookmark_info = {
.name = "bookmark",
.func = zcp_synctask_bookmark,
.pargs = {
- {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING},
- {.za_name = "bookmark", .za_lua_type = LUA_TSTRING},
+ {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING },
+ {.za_name = "bookmark", .za_lua_type = LUA_TSTRING },
{NULL, 0}
},
.kwargs = {
@@ -396,10 +433,10 @@ static zcp_synctask_info_t zcp_synctask_bookmark_info = {
.blocks_modified = 1,
};
-/* ARGSUSED */
static int
zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details)
{
+ (void) err_details;
int err;
const char *source = lua_tostring(state, 1);
const char *new = lua_tostring(state, 2);
@@ -424,15 +461,15 @@ zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details)
}
static int zcp_synctask_set_prop(lua_State *, boolean_t, nvlist_t *err_details);
-static zcp_synctask_info_t zcp_synctask_set_prop_info = {
+static const zcp_synctask_info_t zcp_synctask_set_prop_info = {
.name = "set_prop",
.func = zcp_synctask_set_prop,
.space_check = ZFS_SPACE_CHECK_RESERVED,
.blocks_modified = 2,
.pargs = {
- { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
- { .za_name = "property", .za_lua_type = LUA_TSTRING},
- { .za_name = "value", .za_lua_type = LUA_TSTRING},
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
+ { .za_name = "property", .za_lua_type = LUA_TSTRING },
+ { .za_name = "value", .za_lua_type = LUA_TSTRING },
{ NULL, 0 }
},
.kwargs = {
@@ -443,6 +480,7 @@ static zcp_synctask_info_t zcp_synctask_set_prop_info = {
static int
zcp_synctask_set_prop(lua_State *state, boolean_t sync, nvlist_t *err_details)
{
+ (void) err_details;
int err;
zcp_set_prop_arg_t args = { 0 };
@@ -522,12 +560,12 @@ zcp_synctask_wrapper(lua_State *state)
int
zcp_load_synctask_lib(lua_State *state, boolean_t sync)
{
- int i;
- zcp_synctask_info_t *zcp_synctask_funcs[] = {
+ const zcp_synctask_info_t *zcp_synctask_funcs[] = {
&zcp_synctask_destroy_info,
&zcp_synctask_promote_info,
&zcp_synctask_rollback_info,
&zcp_synctask_snapshot_info,
+ &zcp_synctask_rename_snapshot_info,
&zcp_synctask_inherit_prop_info,
&zcp_synctask_bookmark_info,
&zcp_synctask_set_prop_info,
@@ -536,13 +574,12 @@ zcp_load_synctask_lib(lua_State *state, boolean_t sync)
lua_newtable(state);
- for (i = 0; zcp_synctask_funcs[i] != NULL; i++) {
- zcp_synctask_info_t *info = zcp_synctask_funcs[i];
- lua_pushlightuserdata(state, info);
+ for (int i = 0; zcp_synctask_funcs[i] != NULL; i++) {
+ const zcp_synctask_info_t *info = zcp_synctask_funcs[i];
+ lua_pushlightuserdata(state, (void *)(uintptr_t)info);
lua_pushboolean(state, sync);
lua_pushcclosure(state, &zcp_synctask_wrapper, 2);
lua_setfield(state, -2, info->name);
- info++;
}
return (1);
diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c
index 9d16fff81d0a..1d25bc406866 100644
--- a/sys/contrib/openzfs/module/zfs/zfeature.c
+++ b/sys/contrib/openzfs/module/zfs/zfeature.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -389,6 +389,13 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
!spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) &&
feature->fi_feature == SPA_FEATURE_BOOKMARK_V2)
spa->spa_errata = 0;
+
+ /*
+ * Convert the old on-disk error log to the new format when activating
+ * the head_errlog feature.
+ */
+ if (feature->fi_feature == SPA_FEATURE_HEAD_ERRLOG)
+ spa_upgrade_errlog(spa, tx);
}
static void
diff --git a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c
index cd35849c3f37..8666883f09a2 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -36,9 +36,7 @@ static
void
zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
{
- int i;
-
- for (i = 0; i != ace_cnt; i++, ace++) {
+ for (int i = 0; i != ace_cnt; i++, ace++) {
ace->a_who = BSWAP_32(ace->a_who);
ace->a_access_mask = BSWAP_32(ace->a_access_mask);
ace->a_flags = BSWAP_16(ace->a_flags);
@@ -138,23 +136,16 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
}
}
-/* ARGSUSED */
void
zfs_oldacl_byteswap(void *buf, size_t size)
{
- int cnt;
-
/*
* Arggh, since we don't know how many ACEs are in
* the array, we have to swap the entire block
*/
-
- cnt = size / sizeof (ace_t);
-
- zfs_oldace_byteswap((ace_t *)buf, cnt);
+ zfs_oldace_byteswap((ace_t *)buf, size / sizeof (ace_t));
}
-/* ARGSUSED */
void
zfs_acl_byteswap(void *buf, size_t size)
{
diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c
new file mode 100644
index 000000000000..acedeab7a163
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c
@@ -0,0 +1,379 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_chksum.h>
+#include <sys/zfs_impl.h>
+
+#include <sys/blake3.h>
+#include <sys/sha2.h>
+
+/* limit benchmarking to max 256KiB when EdonR is slower than this (MiB/s): */
+#define LIMIT_PERF_MBS 300
+
+typedef struct {
+ const char *name;
+ const char *impl;
+ uint64_t bs1k;
+ uint64_t bs4k;
+ uint64_t bs16k;
+ uint64_t bs64k;
+ uint64_t bs256k;
+ uint64_t bs1m;
+ uint64_t bs4m;
+ uint64_t bs16m;
+ zio_cksum_salt_t salt;
+ zio_checksum_t *(func);
+ zio_checksum_tmpl_init_t *(init);
+ zio_checksum_tmpl_free_t *(free);
+} chksum_stat_t;
+
+static chksum_stat_t *chksum_stat_data = 0;
+static int chksum_stat_cnt = 0;
+static kstat_t *chksum_kstat = NULL;
+
+/*
+ * Sample output on i3-1005G1 System:
+ *
+ * implementation 1k 4k 16k 64k 256k 1m 4m 16m
+ * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767
+ * skein-generic 548 594 613 623 621 623 621 486
+ * sha256-generic 255 270 281 278 279 281 283 283
+ * sha256-x64 288 310 316 317 318 317 317 316
+ * sha256-ssse3 304 342 351 355 356 357 356 356
+ * sha256-avx 311 348 359 362 362 363 363 362
+ * sha256-avx2 330 378 389 395 395 395 395 395
+ * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230
+ * sha512-generic 359 409 431 427 429 430 428 423
+ * sha512-x64 420 473 490 496 497 497 496 495
+ * sha512-avx 406 522 546 560 560 560 556 560
+ * sha512-avx2 464 568 601 606 609 610 607 608
+ * blake3-generic 330 327 324 323 324 320 323 322
+ * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408
+ * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630
+ * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101
+ * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005
+ */
+static int
+chksum_kstat_headers(char *buf, size_t size)
+{
+ ssize_t off = 0;
+
+ off += kmem_scnprintf(buf + off, size, "%-23s", "implementation");
+ off += kmem_scnprintf(buf + off, size - off, "%8s", "1k");
+ off += kmem_scnprintf(buf + off, size - off, "%8s", "4k");
+ off += kmem_scnprintf(buf + off, size - off, "%8s", "16k");
+ off += kmem_scnprintf(buf + off, size - off, "%8s", "64k");
+ off += kmem_scnprintf(buf + off, size - off, "%8s", "256k");
+ off += kmem_scnprintf(buf + off, size - off, "%8s", "1m");
+ off += kmem_scnprintf(buf + off, size - off, "%8s", "4m");
+ (void) kmem_scnprintf(buf + off, size - off, "%8s\n", "16m");
+
+ return (0);
+}
+
+static int
+chksum_kstat_data(char *buf, size_t size, void *data)
+{
+ chksum_stat_t *cs;
+ ssize_t off = 0;
+ char b[24];
+
+ cs = (chksum_stat_t *)data;
+ kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl);
+ off += kmem_scnprintf(buf + off, size - off, "%-23s", b);
+ off += kmem_scnprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs1k);
+ off += kmem_scnprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs4k);
+ off += kmem_scnprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs16k);
+ off += kmem_scnprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs64k);
+ off += kmem_scnprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs256k);
+ off += kmem_scnprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs1m);
+ off += kmem_scnprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs4m);
+ (void) kmem_scnprintf(buf + off, size - off, "%8llu\n",
+ (u_longlong_t)cs->bs16m);
+
+ return (0);
+}
+
+static void *
+chksum_kstat_addr(kstat_t *ksp, loff_t n)
+{
+ if (n < chksum_stat_cnt)
+ ksp->ks_private = (void *)(chksum_stat_data + n);
+ else
+ ksp->ks_private = NULL;
+
+ return (ksp->ks_private);
+}
+
+static void
+chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
+ uint64_t *result)
+{
+ hrtime_t start;
+ uint64_t run_bw, run_time_ns, run_count = 0, size = 0;
+ uint32_t l, loops = 0;
+ zio_cksum_t zcp;
+
+ switch (round) {
+ case 1: /* 1k */
+ size = 1<<10; loops = 128; break;
+ case 2: /* 4k */
+ size = 1<<12; loops = 64; break;
+ case 3: /* 16k */
+ size = 1<<14; loops = 32; break;
+ case 4: /* 64k */
+ size = 1<<16; loops = 16; break;
+ case 5: /* 256k */
+ size = 1<<18; loops = 8; break;
+ case 6: /* 1m */
+ size = 1<<20; loops = 4; break;
+ case 7: /* 4m */
+ size = 1<<22; loops = 1; break;
+ case 8: /* 16m */
+ size = 1<<24; loops = 1; break;
+ }
+
+ kpreempt_disable();
+ start = gethrtime();
+ do {
+ for (l = 0; l < loops; l++, run_count++)
+ cs->func(abd, size, ctx, &zcp);
+
+ run_time_ns = gethrtime() - start;
+ } while (run_time_ns < MSEC2NSEC(1));
+ kpreempt_enable();
+
+ run_bw = size * run_count * NANOSEC;
+ run_bw /= run_time_ns; /* B/s */
+ *result = run_bw/1024/1024; /* MiB/s */
+}
+
+#define LIMIT_INIT 0
+#define LIMIT_NEEDED 1
+#define LIMIT_NOLIMIT 2
+
+static void
+chksum_benchit(chksum_stat_t *cs)
+{
+ abd_t *abd;
+ void *ctx = 0;
+ void *salt = &cs->salt.zcs_bytes;
+ static int chksum_stat_limit = LIMIT_INIT;
+
+ memset(salt, 0, sizeof (cs->salt.zcs_bytes));
+ if (cs->init)
+ ctx = cs->init(&cs->salt);
+
+ /* allocate test memory via abd linear interface */
+ abd = abd_alloc_linear(1<<20, B_FALSE);
+ chksum_run(cs, abd, ctx, 1, &cs->bs1k);
+ chksum_run(cs, abd, ctx, 2, &cs->bs4k);
+ chksum_run(cs, abd, ctx, 3, &cs->bs16k);
+ chksum_run(cs, abd, ctx, 4, &cs->bs64k);
+ chksum_run(cs, abd, ctx, 5, &cs->bs256k);
+
+ /* check if we ran on a slow cpu */
+ if (chksum_stat_limit == LIMIT_INIT) {
+ if (cs->bs1k < LIMIT_PERF_MBS) {
+ chksum_stat_limit = LIMIT_NEEDED;
+ } else {
+ chksum_stat_limit = LIMIT_NOLIMIT;
+ }
+ }
+
+ /* skip benchmarks >= 1MiB when the CPU is too slow */
+ if (chksum_stat_limit == LIMIT_NEEDED)
+ goto abort;
+
+ chksum_run(cs, abd, ctx, 6, &cs->bs1m);
+ abd_free(abd);
+
+ /* allocate test memory via abd non linear interface */
+ abd = abd_alloc(1<<24, B_FALSE);
+ chksum_run(cs, abd, ctx, 7, &cs->bs4m);
+ chksum_run(cs, abd, ctx, 8, &cs->bs16m);
+
+abort:
+ abd_free(abd);
+
+ /* free up temp memory */
+ if (cs->free)
+ cs->free(ctx);
+}
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+chksum_benchmark(void)
+{
+#ifndef _KERNEL
+ /* we need the benchmark only for the kernel module */
+ return;
+#endif
+
+ chksum_stat_t *cs;
+ uint64_t max;
+ uint32_t id, cbid = 0, id_save;
+ const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3");
+ const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
+ const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
+
+ /* count implementations */
+ chksum_stat_cnt = 2;
+ chksum_stat_cnt += sha256->getcnt();
+ chksum_stat_cnt += sha512->getcnt();
+ chksum_stat_cnt += blake3->getcnt();
+ chksum_stat_data = kmem_zalloc(
+ sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP);
+
+ /* edonr - needs to be the first one here (slow CPU check) */
+ cs = &chksum_stat_data[cbid++];
+
+ /* edonr */
+ cs->init = abd_checksum_edonr_tmpl_init;
+ cs->func = abd_checksum_edonr_native;
+ cs->free = abd_checksum_edonr_tmpl_free;
+ cs->name = "edonr";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* skein */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_skein_tmpl_init;
+ cs->func = abd_checksum_skein_native;
+ cs->free = abd_checksum_skein_tmpl_free;
+ cs->name = "skein";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* sha256 */
+ id_save = sha256->getid();
+ for (max = 0, id = 0; id < sha256->getcnt(); id++) {
+ sha256->setid(id);
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_sha256;
+ cs->free = 0;
+ cs->name = sha256->name;
+ cs->impl = sha256->getname();
+ chksum_benchit(cs);
+ if (cs->bs256k > max) {
+ max = cs->bs256k;
+ sha256->set_fastest(id);
+ }
+ }
+ sha256->setid(id_save);
+
+ /* sha512 */
+ id_save = sha512->getid();
+ for (max = 0, id = 0; id < sha512->getcnt(); id++) {
+ sha512->setid(id);
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_sha512_native;
+ cs->free = 0;
+ cs->name = sha512->name;
+ cs->impl = sha512->getname();
+ chksum_benchit(cs);
+ if (cs->bs256k > max) {
+ max = cs->bs256k;
+ sha512->set_fastest(id);
+ }
+ }
+ sha512->setid(id_save);
+
+ /* blake3 */
+ id_save = blake3->getid();
+ for (max = 0, id = 0; id < blake3->getcnt(); id++) {
+ blake3->setid(id);
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_blake3_tmpl_init;
+ cs->func = abd_checksum_blake3_native;
+ cs->free = abd_checksum_blake3_tmpl_free;
+ cs->name = blake3->name;
+ cs->impl = blake3->getname();
+ chksum_benchit(cs);
+ if (cs->bs256k > max) {
+ max = cs->bs256k;
+ blake3->set_fastest(id);
+ }
+ }
+ blake3->setid(id_save);
+}
+
+void
+chksum_init(void)
+{
+#ifdef _KERNEL
+ blake3_per_cpu_ctx_init();
+#endif
+
+ /* Benchmark supported implementations */
+ chksum_benchmark();
+
+ /* Install kstats for all implementations */
+ chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ if (chksum_kstat != NULL) {
+ chksum_kstat->ks_data = NULL;
+ chksum_kstat->ks_ndata = UINT32_MAX;
+ kstat_set_raw_ops(chksum_kstat,
+ chksum_kstat_headers,
+ chksum_kstat_data,
+ chksum_kstat_addr);
+ kstat_install(chksum_kstat);
+ }
+}
+
+void
+chksum_fini(void)
+{
+ if (chksum_kstat != NULL) {
+ kstat_delete(chksum_kstat);
+ chksum_kstat = NULL;
+ }
+
+ if (chksum_stat_cnt) {
+ kmem_free(chksum_stat_data,
+ sizeof (chksum_stat_t) * chksum_stat_cnt);
+ chksum_stat_cnt = 0;
+ chksum_stat_data = 0;
+ }
+
+#ifdef _KERNEL
+ blake3_per_cpu_ctx_fini();
+#endif
+}
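Illustrative aside (not part of this diff): the per-implementation results collected by chksum_benchmark() are exported through the raw kstat created in chksum_init() ("zfs"/"chksum_bench"). A small userspace sketch for dumping it on Linux follows; the /proc path is assumed from the usual SPL kstat layout (/proc/spl/kstat/<module>/<name>), and on FreeBSD the same data is expected via the kstat sysctl tree.

#include <stdio.h>

/* Dump the checksum benchmark table exposed by the kernel module. */
int
main(void)
{
	FILE *f = fopen("/proc/spl/kstat/zfs/chksum_bench", "r");
	if (f == NULL) {
		perror("chksum_bench");
		return (1);
	}
	char line[256];
	while (fgets(line, sizeof (line), f) != NULL)
		fputs(line, stdout);
	(void) fclose(f);
	return (0);
}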
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
index 007f31b4e7b3..2f43c4aa41b8 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_fm.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -59,7 +59,7 @@
* read I/Os, there are basically three 'types' of I/O, which form a roughly
* layered diagram:
*
- * +---------------+
+ * +---------------+
* | Aggregate I/O | No associated logical data or device
* +---------------+
* |
@@ -124,14 +124,14 @@ static taskqid_t recent_events_cleaner_tqid;
* This setting can be changed dynamically and setting it to zero
* disables duplicate detection.
*/
-unsigned int zfs_zevent_retain_max = 2000;
+static unsigned int zfs_zevent_retain_max = 2000;
/*
* The lifespan for a recent ereport entry. The default of 15 minutes is
* intended to outlive the zfs diagnosis engine's threshold of 10 errors
* over a period of 10 minutes.
*/
-unsigned int zfs_zevent_retain_expire_secs = 900;
+static unsigned int zfs_zevent_retain_expire_secs = 900;
typedef enum zfs_subclass {
ZSC_IO,
@@ -200,12 +200,53 @@ recent_events_compare(const void *a, const void *b)
return (0);
}
+/*
+ * workaround: vdev properties don't have inheritance
+ */
+static uint64_t
+vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
+{
+ uint64_t propdef, propval;
+
+ propdef = vdev_prop_default_numeric(prop);
+ switch (prop) {
+ case VDEV_PROP_CHECKSUM_N:
+ propval = vd->vdev_checksum_n;
+ break;
+ case VDEV_PROP_CHECKSUM_T:
+ propval = vd->vdev_checksum_t;
+ break;
+ case VDEV_PROP_IO_N:
+ propval = vd->vdev_io_n;
+ break;
+ case VDEV_PROP_IO_T:
+ propval = vd->vdev_io_t;
+ break;
+ case VDEV_PROP_SLOW_IO_N:
+ propval = vd->vdev_slow_io_n;
+ break;
+ case VDEV_PROP_SLOW_IO_T:
+ propval = vd->vdev_slow_io_t;
+ break;
+ default:
+ propval = propdef;
+ break;
+ }
+
+ if (propval != propdef)
+ return (propval);
+
+ if (vd->vdev_parent == NULL)
+ return (propdef);
+
+ return (vdev_prop_get_inherited(vd->vdev_parent, prop));
+}
+
static void zfs_ereport_schedule_cleaner(void);
/*
* background task to clean stale recent event nodes.
*/
-/*ARGSUSED*/
static void
zfs_ereport_cleaner(void *arg)
{
@@ -254,7 +295,6 @@ void
zfs_ereport_clear(spa_t *spa, vdev_t *vd)
{
uint64_t vdev_guid, pool_guid;
- int cnt = 0;
ASSERT(vd != NULL || spa != NULL);
if (vd == NULL) {
@@ -278,7 +318,6 @@ zfs_ereport_clear(spa_t *spa, vdev_t *vd)
avl_remove(&recent_events_tree, entry);
list_remove(&recent_events_list, entry);
kmem_free(entry, sizeof (*entry));
- cnt++;
}
}
@@ -665,6 +704,69 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
DATA_TYPE_UINT64, zb->zb_blkid, NULL);
}
+ /*
+ * Payload for tuning the zed
+ */
+ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
+ uint64_t cksum_n, cksum_t;
+
+ cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
+ if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
+ DATA_TYPE_UINT64,
+ cksum_n,
+ NULL);
+
+ cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
+ if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
+ DATA_TYPE_UINT64,
+ cksum_t,
+ NULL);
+ }
+
+ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
+ uint64_t io_n, io_t;
+
+ io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
+ if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
+ DATA_TYPE_UINT64,
+ io_n,
+ NULL);
+
+ io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
+ if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
+ DATA_TYPE_UINT64,
+ io_t,
+ NULL);
+ }
+
+ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+ uint64_t slow_io_n, slow_io_t;
+
+ slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
+ if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
+ DATA_TYPE_UINT64,
+ slow_io_n,
+ NULL);
+
+ slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
+ if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
+ DATA_TYPE_UINT64,
+ slow_io_t,
+ NULL);
+ }
+
mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport;
@@ -678,10 +780,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
#define MAX_RANGES 16
typedef struct zfs_ecksum_info {
- /* histograms of set and cleared bits by bit number in a 64-bit word */
- uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
- uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
-
/* inline arrays of bits set and cleared. */
uint64_t zei_bits_set[ZFM_MAX_INLINE];
uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
@@ -705,7 +803,7 @@ typedef struct zfs_ecksum_info {
} zfs_ecksum_info_t;
static void
-update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
+update_bad_bits(uint64_t value_arg, uint32_t *count)
{
size_t i;
size_t bits = 0;
@@ -713,10 +811,8 @@ update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
/* We store the bits in big-endian (largest-first) order */
for (i = 0; i < 64; i++) {
- if (value & (1ull << i)) {
- hist[63 - i]++;
+ if (value & (1ull << i))
++bits;
- }
}
/* update the count of bits changed */
*count += bits;
@@ -826,9 +922,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
const uint64_t *good;
const uint64_t *bad;
- uint64_t allset = 0;
- uint64_t allcleared = 0;
-
size_t nui64s = size / sizeof (uint64_t);
size_t inline_size;
@@ -847,14 +940,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
if (info != NULL && info->zbc_has_cksum) {
fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
- DATA_TYPE_UINT64_ARRAY,
- sizeof (info->zbc_expected) / sizeof (uint64_t),
- (uint64_t *)&info->zbc_expected,
- FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
- DATA_TYPE_UINT64_ARRAY,
- sizeof (info->zbc_actual) / sizeof (uint64_t),
- (uint64_t *)&info->zbc_actual,
FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
DATA_TYPE_STRING,
info->zbc_checksum_name,
@@ -930,9 +1015,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
// bits set in good, but not in bad
cleared = (good[idx] & (~bad[idx]));
- allset |= set;
- allcleared |= cleared;
-
if (!no_inline) {
ASSERT3U(offset, <, inline_size);
eip->zei_bits_set[offset] = set;
@@ -940,10 +1022,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
offset++;
}
- update_histogram(set, eip->zei_histogram_set,
- &eip->zei_range_sets[range]);
- update_histogram(cleared, eip->zei_histogram_cleared,
- &eip->zei_range_clears[range]);
+ update_bad_bits(set, &eip->zei_range_sets[range]);
+ update_bad_bits(cleared, &eip->zei_range_clears[range]);
}
/* convert to byte offsets */
@@ -979,23 +1059,14 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
DATA_TYPE_UINT8_ARRAY,
inline_size, (uint8_t *)eip->zei_bits_cleared,
NULL);
- } else {
- fm_payload_set(ereport,
- FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
- DATA_TYPE_UINT32_ARRAY,
- NBBY * sizeof (uint64_t), eip->zei_histogram_set,
- FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
- DATA_TYPE_UINT32_ARRAY,
- NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
- NULL);
}
return (eip);
}
#else
-/*ARGSUSED*/
void
zfs_ereport_clear(spa_t *spa, vdev_t *vd)
{
+ (void) spa, (void) vd;
}
#endif
@@ -1025,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
return (B_FALSE);
if (zio != NULL) {
- /*
- * If this is not a read or write zio, ignore the error. This
- * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
- */
+ /* If this is not a read or write zio, ignore the error */
if (zio->io_type != ZIO_TYPE_READ &&
zio->io_type != ZIO_TYPE_WRITE)
return (B_FALSE);
@@ -1072,6 +1140,8 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
(zio != NULL) && (!zio->io_timestamp)) {
return (B_FALSE);
}
+#else
+ (void) subclass, (void) spa, (void) vd, (void) zio;
#endif
return (B_TRUE);
}
@@ -1112,6 +1182,9 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
/* Cleanup is handled by the callback function */
rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+#else
+ (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
+ (void) state;
#endif
return (rc);
}
@@ -1141,6 +1214,8 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
return (SET_ERROR(EBUSY));
+#else
+ (void) zb, (void) offset;
#endif
report = kmem_zalloc(sizeof (*report), KM_SLEEP);
@@ -1150,7 +1225,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
/* copy the checksum failure information if it was provided */
if (info != NULL) {
report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
- bcopy(info, report->zcr_ckinfo, sizeof (*info));
+ memcpy(report->zcr_ckinfo, info, sizeof (*info));
}
report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
@@ -1193,6 +1268,9 @@ zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
report->zcr_ereport = report->zcr_detector = NULL;
if (info != NULL)
kmem_free(info, sizeof (*info));
+#else
+ (void) report, (void) good_data, (void) bad_data,
+ (void) drop_if_identical;
#endif
}
@@ -1257,6 +1335,9 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
kmem_free(info, sizeof (*info));
}
+#else
+ (void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
+ (void) length, (void) good_data, (void) bad_data, (void) zbc;
#endif
return (rc);
}
@@ -1321,7 +1402,8 @@ zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
(void) nvlist_add_nvpair(resource, elem);
}
-
+#else
+ (void) spa, (void) vd, (void) type, (void) name, (void) aux;
#endif
return (resource);
}
@@ -1336,6 +1418,8 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
resource = zfs_event_create(spa, vd, type, name, aux);
if (resource)
zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
+#else
+ (void) spa, (void) vd, (void) type, (void) name, (void) aux;
#endif
}
@@ -1380,17 +1464,17 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
aux = fm_nvlist_create(NULL);
if (vd && aux) {
if (vd->vdev_physpath) {
- (void) nvlist_add_string(aux,
+ fnvlist_add_string(aux,
FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
vd->vdev_physpath);
}
if (vd->vdev_enc_sysfs_path) {
- (void) nvlist_add_string(aux,
+ fnvlist_add_string(aux,
FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
vd->vdev_enc_sysfs_path);
}
- (void) nvlist_add_uint64(aux,
+ fnvlist_add_uint64(aux,
FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
}
@@ -1399,6 +1483,8 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
if (aux)
fm_nvlist_destroy(aux, FM_NVA_FREE);
+#else
+ (void) spa, (void) vd, (void) laststate;
#endif
}
@@ -1434,9 +1520,8 @@ zfs_ereport_fini(void)
{
recent_events_node_t *entry;
- while ((entry = list_head(&recent_events_list)) != NULL) {
+ while ((entry = list_remove_head(&recent_events_list)) != NULL) {
avl_remove(&recent_events_tree, entry);
- list_remove(&recent_events_list, entry);
kmem_free(entry, sizeof (*entry));
}
avl_destroy(&recent_events_tree);
@@ -1450,7 +1535,7 @@ zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
nvlist_t *aux;
aux = fm_nvlist_create(NULL);
- nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
+ fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
fm_nvlist_destroy(aux, FM_NVA_FREE);
@@ -1485,12 +1570,12 @@ zfs_ereport_zvol_post(const char *subclass, const char *name,
return;
aux = fm_nvlist_create(NULL);
- nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
- nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
+ fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
+ fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
raw_name);
r = strchr(name, '/');
if (r && r[1])
- nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
+ fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
fm_nvlist_destroy(aux, FM_NVA_FREE);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
index a90bf5feeea1..add4241dcc99 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -61,7 +61,7 @@ typedef struct fuid_domain {
uint64_t f_idx;
} fuid_domain_t;
-static char *nulldomain = "";
+static const char *const nulldomain = "";
/*
* Compare two indexes.
@@ -133,7 +133,7 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
for (i = 0; i != count; i++) {
fuid_domain_t *domnode;
- char *domain;
+ const char *domain;
uint64_t idx;
VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
@@ -171,7 +171,7 @@ zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
avl_destroy(idx_tree);
}
-char *
+const char *
zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
{
fuid_domain_t searchnode, *findnode;
@@ -258,8 +258,8 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
domnode->f_ksid->kd_name) == 0);
}
- VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
- fuids, numnodes) == 0);
+ fnvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
+ (const nvlist_t * const *)fuids, numnodes);
for (i = 0; i != numnodes; i++)
nvlist_free(fuids[i]);
kmem_free(fuids, numnodes * sizeof (void *));
@@ -290,9 +290,9 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
* necessary for the caller or another thread to detect the dirty table
* and sync out the changes.
*/
-int
+static int
zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
- char **retdomain, boolean_t addok)
+ const char **retdomain, boolean_t addok)
{
fuid_domain_t searchnode, *findnode;
avl_index_t loc;
@@ -358,7 +358,7 @@ retry:
const char *
zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
{
- char *domain;
+ const char *domain;
if (idx == 0 || !zfsvfs->z_use_fuids)
return (NULL);
@@ -518,8 +518,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
uint64_t idx;
ksid_t *ksid;
uint32_t rid;
- char *kdomain;
- const char *domain;
+ const char *kdomain, *domain;
uid_t id;
VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
@@ -574,8 +573,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
{
#ifdef HAVE_KSID
- const char *domain;
- char *kdomain;
+ const char *domain, *kdomain;
uint32_t fuid_idx = FUID_INDEX(id);
uint32_t rid = 0;
idmap_stat status;
@@ -624,7 +622,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
rid = FUID_RID(fuidp->z_fuid_group);
idx = FUID_INDEX(fuidp->z_fuid_group);
break;
- };
+ }
domain = fuidp->z_domain_table[idx - 1];
} else {
if (type == ZFS_OWNER || type == ZFS_ACE_USER)
@@ -701,19 +699,15 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
zfs_fuid_t *zfuid;
zfs_fuid_domain_t *zdomain;
- while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
- list_remove(&fuidp->z_fuids, zfuid);
+ while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL)
kmem_free(zfuid, sizeof (zfs_fuid_t));
- }
if (fuidp->z_domain_table != NULL)
kmem_free(fuidp->z_domain_table,
(sizeof (char *)) * fuidp->z_domain_cnt);
- while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
- list_remove(&fuidp->z_domains, zdomain);
+ while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL)
kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
- }
kmem_free(fuidp, sizeof (zfs_fuid_info_t));
}
diff --git a/sys/contrib/openzfs/module/zfs/spa_boot.c b/sys/contrib/openzfs/module/zfs/zfs_impl.c
index 674394650f82..20322ff98b31 100644
--- a/sys/contrib/openzfs/module/zfs/spa_boot.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_impl.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,31 +20,42 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
*/
-#ifdef _KERNEL
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
-#include <sys/zio.h>
-#include <sys/spa_boot.h>
-#include <sys/sunddi.h>
+#include <sys/blake3.h>
+#include <sys/sha2.h>
-char *
-spa_get_bootprop(char *propname)
+/*
+ * impl_ops - backend for implementations of algorithms
+ */
+const zfs_impl_t *impl_ops[] = {
+ &zfs_blake3_ops,
+ &zfs_sha256_ops,
+ &zfs_sha512_ops,
+ NULL
+};
+
+/*
+ * zfs_impl_get_ops - Get the API functions for an impl backend
+ */
+const zfs_impl_t *
+zfs_impl_get_ops(const char *algo)
{
- char *value;
+ const zfs_impl_t **ops = impl_ops;
- if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
- DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS)
- return (NULL);
- return (value);
-}
+ if (!algo || !*algo)
+ return (*ops);
-void
-spa_free_bootprop(char *value)
-{
- ddi_prop_free(value);
-}
+ for (; *ops; ops++) {
+ if (strcmp(algo, (*ops)->name) == 0)
+ break;
+ }
-#endif /* _KERNEL */
+ ASSERT3P(ops, !=, NULL);
+ return (*ops);
+}
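
As an aside on the new backend table: zfs_impl_get_ops() walks a NULL-terminated array and falls back to the first entry (BLAKE3) when no algorithm name is given. A minimal, self-contained sketch of that lookup pattern, using a stand-in struct rather than the real zfs_impl_t, for readers who want to see it in isolation:

/*
 * Illustrative sketch only: the "walk a NULL-terminated ops table,
 * default to the first entry" pattern used by zfs_impl_get_ops().
 * algo_ops_t is a stand-in, not the real zfs_impl_t.
 */
#include <stdio.h>
#include <string.h>

typedef struct algo_ops {
	const char *name;
} algo_ops_t;

static const algo_ops_t blake3_ops = { "blake3" };
static const algo_ops_t sha256_ops = { "sha256" };
static const algo_ops_t sha512_ops = { "sha512" };
static const algo_ops_t *table[] = {
	&blake3_ops, &sha256_ops, &sha512_ops, NULL
};

static const algo_ops_t *
lookup(const char *algo)
{
	const algo_ops_t **ops = table;

	if (algo == NULL || *algo == '\0')
		return (*ops);		/* no name: default backend */
	for (; *ops != NULL; ops++) {
		if (strcmp(algo, (*ops)->name) == 0)
			break;
	}
	return (*ops);			/* NULL if nothing matched */
}

int
main(void)
{
	printf("%s\n", lookup("sha512")->name);	/* sha512 */
	printf("%s\n", lookup("")->name);	/* blake3 */
	return (0);
}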
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index 96a021acbc95..7b527eb75e83 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -23,11 +23,11 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright 2011 Martin Matuska
* Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * Copyright (c) 2012 Pawel Jakub Dawidek
* Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
@@ -38,8 +38,9 @@
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
* Copyright (c) 2019 Datto Inc.
* Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2021, 2024, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright 2024 Oxide Computer Company
*/
/*
@@ -222,23 +223,22 @@
#include <sys/zfs_ioctl_impl.h>
kmutex_t zfsdev_state_lock;
-zfsdev_state_t *zfsdev_state_list;
+static zfsdev_state_t zfsdev_state_listhead;
/*
* Limit maximum nvlist size. We don't want users passing in insane values
* for zc->zc_nvlist_src_size, since we will need to allocate that much memory.
* Defaults to 0=auto which is handled by platform code.
*/
-unsigned long zfs_max_nvlist_src_size = 0;
+uint64_t zfs_max_nvlist_src_size = 0;
/*
* When logging the output nvlist of an ioctl in the on-disk history, limit
* the logged size to this many bytes. This must be less than DMU_MAX_ACCESS.
* This applies primarily to zfs_ioc_channel_program().
*/
-unsigned long zfs_history_output_max = 1024 * 1024;
+static uint64_t zfs_history_output_max = 1024 * 1024;
-uint_t zfs_fsyncer_key;
uint_t zfs_allow_log_key;
/* DATA_TYPE_ANY is used when zkey_type can vary. */
@@ -373,10 +373,10 @@ zfs_log_history(zfs_cmd_t *zc)
* Policy for top-level read operations (list pools). Requires no privileges,
* and can be used in the local zone, as there is no associated dataset.
*/
-/* ARGSUSED */
static int
zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc, (void) innvl, (void) cr;
return (0);
}
@@ -384,10 +384,10 @@ zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
* Policy for dataset read operations (list children, get statistics). Requires
* no privileges, but must be visible in the local zone.
*/
-/* ARGSUSED */
static int
zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl, (void) cr;
if (INGLOBALZONE(curproc) ||
zone_dataset_visible(zc->zc_name, NULL))
return (0);
@@ -604,7 +604,7 @@ static int
zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
cred_t *cr)
{
- char *strval;
+ const char *strval;
/*
* Check permissions for special properties.
@@ -656,35 +656,29 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
- int error;
-
- error = zfs_dozonecheck(zc->zc_name, cr);
- if (error != 0)
- return (error);
-
/*
* permission to set permissions will be evaluated later in
* dsl_deleg_can_allow()
*/
- return (0);
+ (void) innvl;
+ return (zfs_dozonecheck(zc->zc_name, cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_ROLLBACK, cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
dsl_pool_t *dp;
dsl_dataset_t *ds;
const char *cp;
@@ -717,10 +711,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (error);
}
-/* ARGSUSED */
static int
zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
return (zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_SEND, cr));
}
@@ -728,12 +722,14 @@ zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
static int
zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc, (void) innvl, (void) cr;
return (SET_ERROR(ENOTSUP));
}
static int
zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc, (void) innvl, (void) cr;
return (SET_ERROR(ENOTSUP));
}
@@ -745,7 +741,7 @@ zfs_get_parent(const char *datasetname, char *parent, int parentsize)
/*
* Remove the @bla or /bla from the end of the name to get the parent.
*/
- (void) strncpy(parent, datasetname, parentsize);
+ (void) strlcpy(parent, datasetname, parentsize);
cp = strrchr(parent, '@');
if (cp != NULL) {
cp[0] = '\0';
@@ -771,10 +767,10 @@ zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
}
@@ -782,10 +778,10 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
* Destroying snapshots with delegated permissions requires
* descendant mount and destroy permissions.
*/
-/* ARGSUSED */
static int
zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc;
nvlist_t *snaps;
nvpair_t *pair, *nextpair;
int error = 0;
@@ -844,17 +840,17 @@ zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
return (error);
}
-/* ARGSUSED */
static int
zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
dsl_pool_t *dp;
dsl_dataset_t *clone;
int error;
@@ -899,10 +895,10 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (error);
}
-/* ARGSUSED */
static int
zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
int error;
if ((error = zfs_secpolicy_write_perms(zc->zc_name,
@@ -917,13 +913,6 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
ZFS_DELEG_PERM_CREATE, cr));
}
-/* ARGSUSED */
-static int
-zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
-{
- return (zfs_secpolicy_recv(zc, innvl, cr));
-}
-
int
zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
{
@@ -934,10 +923,10 @@ zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
/*
* Check for permission to create each snapshot in the nvlist.
*/
-/* ARGSUSED */
static int
zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc;
nvlist_t *snaps;
int error = 0;
nvpair_t *pair;
@@ -946,7 +935,7 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
- char *name = nvpair_name(pair);
+ char *name = (char *)nvpair_name(pair);
char *atp = strchr(name, '@');
if (atp == NULL) {
@@ -965,15 +954,15 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
/*
* Check for permission to create each bookmark in the nvlist.
*/
-/* ARGSUSED */
static int
zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc;
int error = 0;
for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
- char *name = nvpair_name(pair);
+ char *name = (char *)nvpair_name(pair);
char *hashp = strchr(name, '#');
if (hashp == NULL) {
@@ -990,16 +979,16 @@ zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (error);
}
-/* ARGSUSED */
static int
zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc;
nvpair_t *pair, *nextpair;
int error = 0;
for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
pair = nextpair) {
- char *name = nvpair_name(pair);
+ char *name = (char *)nvpair_name(pair);
char *hashp = strchr(name, '#');
nextpair = nvlist_next_nvpair(innvl, pair);
@@ -1031,10 +1020,10 @@ zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (error);
}
-/* ARGSUSED */
static int
zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc, (void) innvl, (void) cr;
/*
* Even root must have a proper TSD so that we know what pool
* to log to.
@@ -1047,9 +1036,9 @@ zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
static int
zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
- char parentname[ZFS_MAX_DATASET_NAME_LEN];
- int error;
- char *origin;
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+ const char *origin;
if ((error = zfs_get_parent(zc->zc_name, parentname,
sizeof (parentname))) != 0)
@@ -1072,10 +1061,11 @@ zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
* Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
* SYS_CONFIG privilege, which is not available in a local zone.
*/
-/* ARGSUSED */
int
zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc, (void) innvl;
+
if (secpolicy_sys_config(cr, B_FALSE) != 0)
return (SET_ERROR(EPERM));
@@ -1085,13 +1075,13 @@ zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
/*
* Policy for object to name lookups.
*/
-/* ARGSUSED */
static int
zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
int error;
- if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+ if (secpolicy_sys_config(cr, B_FALSE) == 0)
return (0);
error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
@@ -1101,20 +1091,20 @@ zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
/*
* Policy for fault injection. Requires all privileges.
*/
-/* ARGSUSED */
static int
zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc, (void) innvl;
return (secpolicy_zinject(cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (!zfs_prop_user(zc->zc_value))
return (SET_ERROR(EINVAL));
return (zfs_secpolicy_write_perms(zc->zc_name,
@@ -1174,18 +1164,18 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
userquota_perms[zc->zc_objset_type], cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) innvl;
return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
NULL, cr));
}
-/* ARGSUSED */
static int
zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc;
nvpair_t *pair;
nvlist_t *holds;
int error;
@@ -1206,10 +1196,10 @@ zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (0);
}
-/* ARGSUSED */
static int
zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ (void) zc;
nvpair_t *pair;
int error;
@@ -1240,8 +1230,8 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
*/
int error;
- if ((error = zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_DIFF, cr)) == 0)
+ if (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_DIFF, cr) == 0)
return (0);
error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
@@ -1289,8 +1279,7 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
packed = vmem_alloc(size, KM_SLEEP);
- if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
- iflag)) != 0) {
+ if (ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag) != 0) {
vmem_free(packed, size);
return (SET_ERROR(EFAULT));
}
@@ -1407,7 +1396,8 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
* which prevents all inode ops from running.
*/
static int
-zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
+zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp,
+ boolean_t writer)
{
int error = 0;
@@ -1432,7 +1422,7 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
}
static void
-zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag)
{
ZFS_TEARDOWN_EXIT(zfsvfs, tag);
@@ -1470,7 +1460,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
nvlist_t *nvl = NULL;
nvlist_t *hidden_args = NULL;
uint64_t version = SPA_VERSION;
- char *tname;
+ const char *tname;
(void) nvlist_lookup_uint64(props,
zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
@@ -1592,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
nvlist_t *configs;
int error;
- if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
- return (SET_ERROR(EEXIST));
+ error = spa_all_configs(&zc->zc_cookie, &configs);
+ if (error)
+ return (error);
error = put_nvlist(zc, configs);
@@ -1695,6 +1686,47 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc)
return (error);
}
+/*
+ * inputs:
+ * poolname name of the pool
+ * scan_type scan func (pool_scan_func_t)
+ * scan_command scrub pause/resume flag (pool_scrub_cmd_t)
+ */
+static const zfs_ioc_key_t zfs_keys_pool_scrub[] = {
+ {"scan_type", DATA_TYPE_UINT64, 0},
+ {"scan_command", DATA_TYPE_UINT64, 0},
+};
+
+static int
+zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+ uint64_t scan_type, scan_cmd;
+
+ if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0)
+ return (SET_ERROR(EINVAL));
+ if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (scan_cmd >= POOL_SCRUB_FLAGS_END)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(poolname, &spa, FTAG)) != 0)
+ return (error);
+
+ if (scan_cmd == POOL_SCRUB_PAUSE) {
+ error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
+ } else if (scan_type == POOL_SCAN_NONE) {
+ error = spa_scan_stop(spa);
+ } else {
+ error = spa_scan(spa, scan_type);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
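
For orientation, the new ZFS_IOC_POOL_SCRUB handler above consumes a packed nvlist carrying the two uint64 keys from zfs_keys_pool_scrub. A hedged userland sketch of assembling that nvlist for a scrub pause; the enum names come from pool_scan_func_t / pool_scrub_cmd_t in sys/fs/zfs.h, and the ioctl plumbing (e.g. through libzfs_core) is deliberately omitted:

/*
 * Illustrative only: the innvl shape expected by zfs_ioc_pool_scrub().
 * Enum values are taken from pool_scan_func_t / pool_scrub_cmd_t.
 */
#include <libnvpair.h>
#include <sys/fs/zfs.h>

static nvlist_t *
make_scrub_pause_args(void)
{
	nvlist_t *innvl = fnvlist_alloc();

	fnvlist_add_uint64(innvl, "scan_type", POOL_SCAN_SCRUB);
	fnvlist_add_uint64(innvl, "scan_command", POOL_SCRUB_PAUSE);

	/* caller hands this to the ioctl, then fnvlist_free()s it */
	return (innvl);
}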
static int
zfs_ioc_pool_freeze(zfs_cmd_t *zc)
{
@@ -1855,7 +1887,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config);
if (error == 0) {
- error = spa_vdev_add(spa, config);
+ error = spa_vdev_add(spa, config, zc->zc_flags);
nvlist_free(config);
}
spa_close(spa, FTAG);
@@ -1921,6 +1953,10 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
break;
+ case VDEV_STATE_REMOVED:
+ error = vdev_remove_wanted(spa, zc->zc_guid);
+ break;
+
default:
error = SET_ERROR(EINVAL);
}
@@ -2044,7 +2080,7 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
dmu_objset_fast_stat(os, &zc->zc_objset_stats);
- if (zc->zc_nvlist_dst != 0 &&
+ if (!zc->zc_simple && zc->zc_nvlist_dst != 0 &&
(error = dsl_prop_get_all(os, &nv)) == 0) {
dmu_objset_stats(os, nv);
/*
@@ -2300,7 +2336,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
}
while (error == 0) {
- if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ if (issig()) {
error = SET_ERROR(EINTR);
break;
}
@@ -2331,6 +2367,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
}
if (zc->zc_simple) {
+ dsl_dataset_fast_stat(ds, &zc->zc_objset_stats);
dsl_dataset_rele(ds, FTAG);
break;
}
@@ -2416,7 +2453,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
const char *strval = NULL;
int err = -1;
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (zfs_prop_userquota(propname))
return (zfs_prop_set_userquota(dsname, pair));
return (-1);
@@ -2486,11 +2523,27 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
case ZFS_PROP_VOLSIZE:
err = zvol_set_volsize(dsname, intval);
break;
- case ZFS_PROP_SNAPDEV:
- err = zvol_set_snapdev(dsname, source, intval);
+ case ZFS_PROP_VOLTHREADING:
+ err = zvol_set_volthreading(dsname, intval);
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
break;
+ case ZFS_PROP_SNAPDEV:
case ZFS_PROP_VOLMODE:
- err = zvol_set_volmode(dsname, source, intval);
+ err = zvol_set_common(dsname, prop, source, intval);
+ break;
+ case ZFS_PROP_READONLY:
+ err = zvol_set_ro(dsname, intval);
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
break;
case ZFS_PROP_VERSION:
{
@@ -2558,6 +2611,7 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
nvpair_t *pair;
nvpair_t *propval;
int rv = 0;
+ int err;
uint64_t intval;
const char *strval;
boolean_t should_update_mount_cache = B_FALSE;
@@ -2569,7 +2623,7 @@ retry:
while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
- int err = 0;
+ err = 0;
/* decode the property value */
propval = pair;
@@ -2586,7 +2640,7 @@ retry:
/* inherited properties are expected to be booleans */
if (nvpair_type(propval) != DATA_TYPE_BOOLEAN)
err = SET_ERROR(EINVAL);
- } else if (err == 0 && prop == ZPROP_INVAL) {
+ } else if (err == 0 && prop == ZPROP_USERPROP) {
if (zfs_prop_user(propname)) {
if (nvpair_type(propval) != DATA_TYPE_STRING)
err = SET_ERROR(EINVAL);
@@ -2668,47 +2722,52 @@ retry:
goto retry;
}
- if (!nvlist_empty(genericnvl) &&
- dsl_props_set(dsname, source, genericnvl) != 0) {
- /*
- * If this fails, we still want to set as many properties as we
- * can, so try setting them individually.
- */
- pair = NULL;
- while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
- const char *propname = nvpair_name(pair);
- int err = 0;
-
- propval = pair;
- if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
- nvlist_t *attrs;
- attrs = fnvpair_value_nvlist(pair);
- propval = fnvlist_lookup_nvpair(attrs,
- ZPROP_VALUE);
- }
+ if (nvlist_empty(genericnvl))
+ goto out;
- if (nvpair_type(propval) == DATA_TYPE_STRING) {
- strval = fnvpair_value_string(propval);
- err = dsl_prop_set_string(dsname, propname,
- source, strval);
- } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) {
- err = dsl_prop_inherit(dsname, propname,
- source);
- } else {
- intval = fnvpair_value_uint64(propval);
- err = dsl_prop_set_int(dsname, propname, source,
- intval);
- }
+ /*
+ * Try to set them all in one batch.
+ */
+ err = dsl_props_set(dsname, source, genericnvl);
+ if (err == 0)
+ goto out;
- if (err != 0) {
- if (errlist != NULL) {
- fnvlist_add_int32(errlist, propname,
- err);
- }
- rv = err;
+ /*
+ * If batching fails, we still want to set as many properties as we
+ * can, so try setting them individually.
+ */
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ attrs = fnvpair_value_nvlist(pair);
+ propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
+ }
+
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ strval = fnvpair_value_string(propval);
+ err = dsl_prop_set_string(dsname, propname,
+ source, strval);
+ } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) {
+ err = dsl_prop_inherit(dsname, propname, source);
+ } else {
+ intval = fnvpair_value_uint64(propval);
+ err = dsl_prop_set_int(dsname, propname, source,
+ intval);
+ }
+
+ if (err != 0) {
+ if (errlist != NULL) {
+ fnvlist_add_int32(errlist, propname, err);
}
+ rv = err;
}
}
+
+out:
if (should_update_mount_cache)
zfs_ioctl_update_mount_cache(dsname);
@@ -2856,11 +2915,11 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc)
* and reservation to the received or default values even though
* they are not considered inheritable.
*/
- if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop))
return (SET_ERROR(EINVAL));
}
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (!zfs_prop_user(propname))
return (SET_ERROR(EINVAL));
@@ -2928,7 +2987,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc)
mutex_enter(&spa_namespace_lock);
if ((spa = spa_lookup(zc->zc_name)) != NULL) {
spa_configfile_set(spa, props, B_FALSE);
- spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
}
mutex_exit(&spa_namespace_lock);
if (spa != NULL) {
@@ -2982,6 +3041,96 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
}
/*
+ * innvl: {
+ * "vdevprops_set_vdev" -> guid
+ * "vdevprops_set_props" -> { prop -> value }
+ * }
+ *
+ * outnvl: propname -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = {
+ {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0},
+ {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0}
+};
+
+static int
+zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+ vdev_t *vd;
+ uint64_t vdev_guid;
+
+ /* Early validation */
+ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
+ &vdev_guid) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (outnvl == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(poolname, &spa, FTAG)) != 0)
+ return (error);
+
+ ASSERT(spa_writeable(spa));
+
+ if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ error = vdev_prop_set(vd, innvl, outnvl);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
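
To make the documented innvl shape concrete, here is a hedged sketch of what a caller would build before issuing ZFS_IOC_VDEV_SET_PROPS. The key strings are the ones from the block comment; the "comment" vdev property is used purely as an example value:

/*
 * Illustrative only: innvl layout for zfs_ioc_vdev_set_props().
 * fnvlist_add_nvlist() copies, so the props list can be freed
 * right after it is attached.
 */
#include <libnvpair.h>

static nvlist_t *
make_vdev_setprops_args(uint64_t vdev_guid)
{
	nvlist_t *props = fnvlist_alloc();
	nvlist_t *innvl = fnvlist_alloc();

	fnvlist_add_string(props, "comment", "front bay, slot 4");
	fnvlist_add_uint64(innvl, "vdevprops_set_vdev", vdev_guid);
	fnvlist_add_nvlist(innvl, "vdevprops_set_props", props);
	fnvlist_free(props);
	return (innvl);
}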
+/*
+ * innvl: {
+ * "vdevprops_get_vdev" -> guid
+ * (optional) "vdevprops_get_props" -> { propname -> propid }
+ * }
+ *
+ * outnvl: propname -> value
+ */
+static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = {
+ {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0},
+ {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL}
+};
+
+static int
+zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+ vdev_t *vd;
+ uint64_t vdev_guid;
+
+ /* Early validation */
+ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
+ &vdev_guid) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (outnvl == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(poolname, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ error = vdev_prop_get(vd, innvl, outnvl);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
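
On the read side, the outnvl filled by vdev_prop_get() is a plain name-to-value nvlist, so callers can walk it with the standard nvpair iterators. A short illustrative sketch that prints only the string-valued pairs:

/*
 * Illustrative only: walking the outnvl returned for
 * ZFS_IOC_VDEV_GET_PROPS and printing string-valued properties.
 */
#include <stdio.h>
#include <libnvpair.h>

static void
print_string_props(nvlist_t *outnvl)
{
	for (nvpair_t *pair = nvlist_next_nvpair(outnvl, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(outnvl, pair)) {
		if (nvpair_type(pair) == DATA_TYPE_STRING) {
			printf("%s=%s\n", nvpair_name(pair),
			    fnvpair_value_string(pair));
		}
	}
}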
+/*
* inputs:
* zc_name name of filesystem
* zc_nvlist_src{_size} nvlist of delegated permissions
@@ -3002,7 +3151,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc)
/*
* Verify nvlist is constructed correctly
*/
- if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
+ if (zfs_deleg_verify_nvlist(fsaclnv) != 0) {
nvlist_free(fsaclnv);
return (SET_ERROR(EINVAL));
}
@@ -3052,7 +3201,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc)
return (error);
}
-/* ARGSUSED */
static void
zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
@@ -3414,11 +3562,11 @@ static const zfs_ioc_key_t zfs_keys_remap[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
/* This IOCTL is no longer supported. */
+ (void) fsname, (void) innvl, (void) outnvl;
return (0);
}
@@ -3506,10 +3654,10 @@ static const zfs_ioc_key_t zfs_keys_log_history[] = {
{"message", DATA_TYPE_STRING, 0},
};
-/* ARGSUSED */
static int
zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) unused, (void) outnvl;
const char *message;
char *poolname;
spa_t *spa;
@@ -3612,10 +3760,10 @@ zfs_unmount_snap(const char *snapname)
(void) zfsctl_snapshot_unmount(snapname, MNT_FORCE);
}
-/* ARGSUSED */
static int
zfs_unmount_snap_cb(const char *snapname, void *arg)
{
+ (void) arg;
zfs_unmount_snap(snapname);
return (0);
}
@@ -3659,7 +3807,6 @@ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = {
{"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
};
-/* ARGSUSED */
static int
zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -3712,10 +3859,10 @@ static const zfs_ioc_key_t zfs_keys_bookmark[] = {
{"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST},
};
-/* ARGSUSED */
static int
zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) poolname;
return (dsl_bookmark_create(innvl, outnvl));
}
@@ -3752,11 +3899,11 @@ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl,
nvlist_t *outnvl)
{
+ (void) innvl;
char fsname[ZFS_MAX_DATASET_NAME_LEN];
char *bmname;
@@ -3827,7 +3974,7 @@ static int
zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
nvlist_t *outnvl)
{
- char *program;
+ const char *program;
uint64_t instrlimit, memlimit;
boolean_t sync_flag;
nvpair_t *nvarg = NULL;
@@ -3861,10 +4008,10 @@ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) innvl, (void) outnvl;
return (spa_checkpoint(poolname));
}
@@ -3876,11 +4023,11 @@ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
nvlist_t *outnvl)
{
+ (void) innvl, (void) outnvl;
return (spa_checkpoint_discard(poolname));
}
@@ -3981,7 +4128,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
cmd_type == POOL_INITIALIZE_START ||
- cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ cmd_type == POOL_INITIALIZE_SUSPEND ||
+ cmd_type == POOL_INITIALIZE_UNINIT)) {
return (SET_ERROR(EINVAL));
}
@@ -4242,13 +4390,12 @@ static const zfs_ioc_key_t zfs_keys_rollback[] = {
{"target", DATA_TYPE_STRING, ZK_OPTIONAL},
};
-/* ARGSUSED */
static int
zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
zfsvfs_t *zfsvfs;
zvol_state_handle_t *zv;
- char *target = NULL;
+ const char *target = NULL;
int error;
(void) nvlist_lookup_string(innvl, "target", &target);
@@ -4314,16 +4461,17 @@ recursive_unmount(const char *fsname, void *arg)
* outnvl is unused
*/
-/* ARGSUSED */
static const zfs_ioc_key_t zfs_keys_redact[] = {
{"bookname", DATA_TYPE_STRING, 0},
{"snapnv", DATA_TYPE_NVLIST, 0},
};
+
static int
zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) outnvl;
nvlist_t *redactnvl = NULL;
- char *redactbook = NULL;
+ const char *redactbook = NULL;
if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0)
return (SET_ERROR(EINVAL));
@@ -4403,7 +4551,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
uint64_t intval, compval;
int err;
- if (prop == ZPROP_INVAL) {
+ if (prop == ZPROP_USERPROP) {
if (zfs_prop_user(propname)) {
if ((err = zfs_secpolicy_write_perms(dsname,
ZFS_DELEG_PERM_USERPROP, cr)))
@@ -4719,10 +4867,10 @@ propval_equals(nvpair_t *p1, nvpair_t *p2)
return (B_FALSE);
if (nvpair_type(p1) == DATA_TYPE_STRING) {
- char *valstr1, *valstr2;
+ const char *valstr1, *valstr2;
- VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0);
- VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0);
+ VERIFY(nvpair_value_string(p1, &valstr1) == 0);
+ VERIFY(nvpair_value_string(p2, &valstr2) == 0);
return (strcmp(valstr1, valstr2) == 0);
} else {
uint64_t intval1, intval2;
@@ -4787,6 +4935,11 @@ extract_delay_props(nvlist_t *props)
static const zfs_prop_t delayable[] = {
ZFS_PROP_REFQUOTA,
ZFS_PROP_KEYLOCATION,
+ /*
+ * Setting ZFS_PROP_SHARESMB requires the objset type to be
+ * known, which is not possible prior to receipt of raw sends.
+ */
+ ZFS_PROP_SHARESMB,
0
};
int i;
@@ -4838,9 +4991,9 @@ static boolean_t zfs_ioc_recv_inject_err;
* encountered errors, if any. It's the caller's responsibility to free.
*/
static int
-zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
- nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force,
- boolean_t resumable, int input_fd,
+zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin,
+ nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args,
+ boolean_t force, boolean_t heal, boolean_t resumable, int input_fd,
dmu_replay_record_t *begin_record, uint64_t *read_bytes,
uint64_t *errflags, nvlist_t **errors)
{
@@ -4850,6 +5003,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
offset_t off, noff;
nvlist_t *local_delayprops = NULL;
nvlist_t *recv_delayprops = NULL;
+ nvlist_t *inherited_delayprops = NULL;
nvlist_t *origprops = NULL; /* existing properties */
nvlist_t *origrecvd = NULL; /* existing received properties */
boolean_t first_recvd_props = B_FALSE;
@@ -4865,7 +5019,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
return (SET_ERROR(EBADF));
noff = off = zfs_file_off(input_fp);
- error = dmu_recv_begin(tofs, tosnap, begin_record, force,
+ error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal,
resumable, localprops, hidden_args, origin, &drc, input_fp,
&off);
if (error != 0)
@@ -4949,7 +5103,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
/* -x property */
const char *name = nvpair_name(nvp);
zfs_prop_t prop = zfs_name_to_prop(name);
- if (prop != ZPROP_INVAL) {
+ if (prop != ZPROP_USERPROP) {
if (!zfs_prop_inheritable(prop))
continue;
} else if (!zfs_prop_user(name))
@@ -4964,6 +5118,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
local_delayprops = extract_delay_props(oprops);
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL,
oprops, *errors);
+ inherited_delayprops = extract_delay_props(xprops);
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED,
xprops, *errors);
@@ -5021,6 +5176,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL,
local_delayprops, *errors);
}
+ if (inherited_delayprops != NULL && error == 0) {
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED,
+ inherited_delayprops, *errors);
+ }
}
/*
@@ -5040,6 +5199,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0);
nvlist_free(local_delayprops);
}
+ if (inherited_delayprops != NULL) {
+ ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0);
+ nvlist_free(inherited_delayprops);
+ }
*read_bytes = off - noff;
#ifdef ZFS_DEBUG
@@ -5176,15 +5339,16 @@ zfs_ioc_recv(zfs_cmd_t *zc)
nvlist_t *errors = NULL;
nvlist_t *recvdprops = NULL;
nvlist_t *localprops = NULL;
- char *origin = NULL;
+ const char *origin = NULL;
char *tosnap;
char tofs[ZFS_MAX_DATASET_NAME_LEN];
int error = 0;
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
strchr(zc->zc_value, '@') == NULL ||
- strchr(zc->zc_value, '%'))
+ strchr(zc->zc_value, '%') != NULL) {
return (SET_ERROR(EINVAL));
+ }
(void) strlcpy(tofs, zc->zc_value, sizeof (tofs));
tosnap = strchr(tofs, '@');
@@ -5192,13 +5356,15 @@ zfs_ioc_recv(zfs_cmd_t *zc)
if (zc->zc_nvlist_src != 0 &&
(error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
- zc->zc_iflags, &recvdprops)) != 0)
- return (error);
+ zc->zc_iflags, &recvdprops)) != 0) {
+ goto out;
+ }
if (zc->zc_nvlist_conf != 0 &&
(error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- zc->zc_iflags, &localprops)) != 0)
- return (error);
+ zc->zc_iflags, &localprops)) != 0) {
+ goto out;
+ }
if (zc->zc_string[0])
origin = zc->zc_string;
@@ -5208,10 +5374,8 @@ zfs_ioc_recv(zfs_cmd_t *zc)
begin_record.drr_u.drr_begin = zc->zc_begin_record;
error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops,
- NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record,
+ NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record,
&zc->zc_cookie, &zc->zc_obj, &errors);
- nvlist_free(recvdprops);
- nvlist_free(localprops);
/*
* Now that all props, initial and delayed, are set, report the prop
@@ -5227,7 +5391,10 @@ zfs_ioc_recv(zfs_cmd_t *zc)
error = SET_ERROR(EINVAL);
}
+out:
nvlist_free(errors);
+ nvlist_free(recvdprops);
+ nvlist_free(localprops);
return (error);
}
@@ -5241,6 +5408,7 @@ zfs_ioc_recv(zfs_cmd_t *zc)
* "begin_record" -> non-byteswapped dmu_replay_record_t
* "input_fd" -> file descriptor to read stream from (int32)
* (optional) "force" -> force flag (value ignored)
+ * (optional) "heal" -> use send stream to heal data corruption
* (optional) "resumable" -> resumable flag (value ignored)
* (optional) "cleanup_fd" -> unused
* (optional) "action_handle" -> unused
@@ -5261,6 +5429,7 @@ static const zfs_ioc_key_t zfs_keys_recv_new[] = {
{"begin_record", DATA_TYPE_BYTE_ARRAY, 0},
{"input_fd", DATA_TYPE_INT32, 0},
{"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
{"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
{"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
{"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL},
@@ -5276,11 +5445,12 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
nvlist_t *recvprops = NULL;
nvlist_t *localprops = NULL;
nvlist_t *hidden_args = NULL;
- char *snapname;
- char *origin = NULL;
+ const char *snapname;
+ const char *origin = NULL;
char *tosnap;
char tofs[ZFS_MAX_DATASET_NAME_LEN];
boolean_t force;
+ boolean_t heal;
boolean_t resumable;
uint64_t read_bytes = 0;
uint64_t errflags = 0;
@@ -5291,8 +5461,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
if (dataset_namecheck(snapname, NULL, NULL) != 0 ||
strchr(snapname, '@') == NULL ||
- strchr(snapname, '%'))
+ strchr(snapname, '%') != NULL) {
return (SET_ERROR(EINVAL));
+ }
(void) strlcpy(tofs, snapname, sizeof (tofs));
tosnap = strchr(tofs, '@');
@@ -5310,36 +5481,47 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
input_fd = fnvlist_lookup_int32(innvl, "input_fd");
force = nvlist_exists(innvl, "force");
+ heal = nvlist_exists(innvl, "heal");
resumable = nvlist_exists(innvl, "resumable");
/* we still use "props" here for backwards compatibility */
error = nvlist_lookup_nvlist(innvl, "props", &recvprops);
if (error && error != ENOENT)
- return (error);
+ goto out;
error = nvlist_lookup_nvlist(innvl, "localprops", &localprops);
if (error && error != ENOENT)
- return (error);
+ goto out;
error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
if (error && error != ENOENT)
- return (error);
+ goto out;
error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops,
- hidden_args, force, resumable, input_fd, begin_record,
+ hidden_args, force, heal, resumable, input_fd, begin_record,
&read_bytes, &errflags, &errors);
fnvlist_add_uint64(outnvl, "read_bytes", read_bytes);
fnvlist_add_uint64(outnvl, "error_flags", errflags);
fnvlist_add_nvlist(outnvl, "errors", errors);
+out:
nvlist_free(errors);
nvlist_free(recvprops);
nvlist_free(localprops);
+ nvlist_free(hidden_args);
return (error);
}
+/*
+ * When stack space is limited, we write replication stream data to the target
+ * on a separate taskq thread, to make sure there's enough stack space.
+ */
+#ifndef HAVE_LARGE_STACKS
+#define USE_SEND_TASKQ 1
+#endif
+
typedef struct dump_bytes_io {
zfs_file_t *dbi_fp;
caddr_t dbi_buf;
@@ -5360,31 +5542,65 @@ dump_bytes_cb(void *arg)
dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL);
}
+typedef struct dump_bytes_arg {
+ zfs_file_t *dba_fp;
+#ifdef USE_SEND_TASKQ
+ taskq_t *dba_tq;
+ taskq_ent_t dba_tqent;
+#endif
+} dump_bytes_arg_t;
+
static int
dump_bytes(objset_t *os, void *buf, int len, void *arg)
{
+ dump_bytes_arg_t *dba = (dump_bytes_arg_t *)arg;
dump_bytes_io_t dbi;
- dbi.dbi_fp = arg;
+ dbi.dbi_fp = dba->dba_fp;
dbi.dbi_buf = buf;
dbi.dbi_len = len;
-#if defined(HAVE_LARGE_STACKS)
- dump_bytes_cb(&dbi);
+#ifdef USE_SEND_TASKQ
+ taskq_dispatch_ent(dba->dba_tq, dump_bytes_cb, &dbi, TQ_SLEEP,
+ &dba->dba_tqent);
+ taskq_wait(dba->dba_tq);
#else
- /*
- * The vn_rdwr() call is performed in a taskq to ensure that there is
- * always enough stack space to write safely to the target filesystem.
- * The ZIO_TYPE_FREE threads are used because there can be a lot of
- * them and they are used in vdev_file.c for a similar purpose.
- */
- spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE,
- ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
-#endif /* HAVE_LARGE_STACKS */
+ dump_bytes_cb(&dbi);
+#endif
return (dbi.dbi_err);
}
+static int
+dump_bytes_init(dump_bytes_arg_t *dba, int fd, dmu_send_outparams_t *out)
+{
+ zfs_file_t *fp = zfs_file_get(fd);
+ if (fp == NULL)
+ return (SET_ERROR(EBADF));
+
+ dba->dba_fp = fp;
+#ifdef USE_SEND_TASKQ
+ dba->dba_tq = taskq_create("z_send", 1, defclsyspri, 0, 0, 0);
+ taskq_init_ent(&dba->dba_tqent);
+#endif
+
+ memset(out, 0, sizeof (dmu_send_outparams_t));
+ out->dso_outfunc = dump_bytes;
+ out->dso_arg = dba;
+ out->dso_dryrun = B_FALSE;
+
+ return (0);
+}
+
+static void
+dump_bytes_fini(dump_bytes_arg_t *dba)
+{
+ zfs_file_put(dba->dba_fp);
+#ifdef USE_SEND_TASKQ
+ taskq_destroy(dba->dba_tq);
+#endif
+}
+
/*
* inputs:
* zc_name name of snapshot to send
@@ -5469,21 +5685,18 @@ zfs_ioc_send(zfs_cmd_t *zc)
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
} else {
- zfs_file_t *fp;
- dmu_send_outparams_t out = {0};
-
- if ((fp = zfs_file_get(zc->zc_cookie)) == NULL)
- return (SET_ERROR(EBADF));
+ dump_bytes_arg_t dba;
+ dmu_send_outparams_t out;
+ error = dump_bytes_init(&dba, zc->zc_cookie, &out);
+ if (error)
+ return (error);
- off = zfs_file_off(fp);
- out.dso_outfunc = dump_bytes;
- out.dso_arg = fp;
- out.dso_dryrun = B_FALSE;
+ off = zfs_file_off(dba.dba_fp);
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
zc->zc_fromobj, embedok, large_block_ok, compressok,
rawok, savedok, zc->zc_cookie, &off, &out);
- zfs_file_put(fp);
+ dump_bytes_fini(&dba);
}
return (error);
}
@@ -5585,17 +5798,12 @@ zfs_ioc_error_log(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
- size_t count = (size_t)zc->zc_nvlist_dst_size;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
- &count);
- if (error == 0)
- zc->zc_nvlist_dst_size = count;
- else
- zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
+ &zc->zc_nvlist_dst_size);
spa_close(spa, FTAG);
@@ -5654,10 +5862,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* If multihost is enabled, resuming I/O is unsafe as another
- * host may have imported the pool.
+ * host may have imported the pool. Check for remote activity.
*/
- if (spa_multihost(spa) && spa_suspended(spa))
- return (SET_ERROR(EINVAL));
+ if (spa_multihost(spa) && spa_suspended(spa) &&
+ spa_mmp_remote_host_activity(spa)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EREMOTEIO));
+ }
spa_vdev_state_enter(spa, SCL_NONE);
@@ -5703,10 +5914,10 @@ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = {
{"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
};
-/* ARGSUSED */
static int
zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) outnvl;
spa_t *spa;
int error;
boolean_t rc, scrub_restart = B_TRUE;
@@ -6005,10 +6216,6 @@ zfs_ioc_share(zfs_cmd_t *zc)
return (SET_ERROR(ENOSYS));
}
-ace_t full_access[] = {
- {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
-};
-
/*
* inputs:
* zc_name name of containing filesystem
@@ -6117,10 +6324,10 @@ static const zfs_ioc_key_t zfs_keys_hold[] = {
{"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
};
-/* ARGSUSED */
static int
zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
{
+ (void) pool;
nvpair_t *pair;
nvlist_t *holds;
int cleanup_fd = -1;
@@ -6133,7 +6340,7 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
/* make sure the user didn't pass us any invalid (empty) tags */
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
pair = nvlist_next_nvpair(holds, pair)) {
- char *htag;
+ const char *htag;
error = nvpair_value_string(pair, &htag);
if (error != 0)
@@ -6169,10 +6376,10 @@ static const zfs_ioc_key_t zfs_keys_get_holds[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
{
+ (void) args;
return (dsl_dataset_get_holds(snapname, outnvl));
}
@@ -6191,10 +6398,10 @@ static const zfs_ioc_key_t zfs_keys_release[] = {
{"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST},
};
-/* ARGSUSED */
static int
zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
{
+ (void) pool;
return (dsl_dataset_user_release(holds, errlist));
}
@@ -6252,7 +6459,7 @@ zfs_ioc_events_next(zfs_cmd_t *zc)
static int
zfs_ioc_events_clear(zfs_cmd_t *zc)
{
- int count;
+ uint_t count;
zfs_zevent_drain_all(&count);
zc->zc_cookie = count;
@@ -6353,7 +6560,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
int error;
dsl_pool_t *dp;
dsl_dataset_t *new, *old;
- char *firstsnap;
+ const char *firstsnap;
uint64_t used, comp, uncomp;
firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
@@ -6428,15 +6635,14 @@ static const zfs_ioc_key_t zfs_keys_send_new[] = {
{"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL},
};
-/* ARGSUSED */
static int
zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) outnvl;
int error;
offset_t off;
- char *fromname = NULL;
+ const char *fromname = NULL;
int fd;
- zfs_file_t *fp;
boolean_t largeblockok;
boolean_t embedok;
boolean_t compressok;
@@ -6444,7 +6650,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
boolean_t savedok;
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
- char *redactbook = NULL;
+ const char *redactbook = NULL;
fd = fnvlist_lookup_int32(innvl, "fd");
@@ -6461,28 +6667,28 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
(void) nvlist_lookup_string(innvl, "redactbook", &redactbook);
- if ((fp = zfs_file_get(fd)) == NULL)
- return (SET_ERROR(EBADF));
-
- off = zfs_file_off(fp);
+ dump_bytes_arg_t dba;
+ dmu_send_outparams_t out;
+ error = dump_bytes_init(&dba, fd, &out);
+ if (error)
+ return (error);
- dmu_send_outparams_t out = {0};
- out.dso_outfunc = dump_bytes;
- out.dso_arg = fp;
- out.dso_dryrun = B_FALSE;
+ off = zfs_file_off(dba.dba_fp);
error = dmu_send(snapname, fromname, embedok, largeblockok,
compressok, rawok, savedok, resumeobj, resumeoff,
redactbook, fd, &off, &out);
- zfs_file_put(fp);
+ dump_bytes_fini(&dba);
+
return (error);
}
-/* ARGSUSED */
static int
send_space_sum(objset_t *os, void *buf, int len, void *arg)
{
+ (void) os, (void) buf;
uint64_t *size = arg;
+
*size += len;
return (0);
}
@@ -6533,8 +6739,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
dsl_dataset_t *tosnap;
dsl_dataset_t *fromsnap = NULL;
int error;
- char *fromname = NULL;
- char *redactlist_book = NULL;
+ const char *fromname = NULL;
+ const char *redactlist_book = NULL;
boolean_t largeblockok;
boolean_t embedok;
boolean_t compressok;
@@ -6674,10 +6880,10 @@ static const zfs_ioc_key_t zfs_keys_pool_sync[] = {
{"force", DATA_TYPE_BOOLEAN_VALUE, 0},
};
-/* ARGSUSED */
static int
zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
{
+ (void) onvl;
int err;
boolean_t rc, force = B_FALSE;
spa_t *spa;
@@ -6717,10 +6923,10 @@ static const zfs_ioc_key_t zfs_keys_load_key[] = {
{"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
};
-/* ARGSUSED */
static int
zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) outnvl;
int ret;
dsl_crypto_params_t *dcp = NULL;
nvlist_t *hidden_args;
@@ -6759,10 +6965,10 @@ static const zfs_ioc_key_t zfs_keys_unload_key[] = {
/* no nvl keys */
};
-/* ARGSUSED */
static int
zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) innvl, (void) outnvl;
int ret = 0;
if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
@@ -6780,7 +6986,7 @@ out:
/*
* Changes a user's wrapping key used to decrypt a dataset. The keyformat,
- * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified
+ * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified
* here to change how the key is derived in userspace.
*
* innvl: {
@@ -6797,10 +7003,10 @@ static const zfs_ioc_key_t zfs_keys_change_key[] = {
{"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
};
-/* ARGSUSED */
static int
zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ (void) outnvl;
int ret;
uint64_t cmd = DCP_CMD_NONE;
dsl_crypto_params_t *dcp = NULL;
@@ -7029,7 +7235,7 @@ zfs_ioctl_init(void)
ARRAY_SIZE(zfs_keys_destroy_bookmarks));
zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW,
- zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME,
+ zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new));
zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
@@ -7107,6 +7313,21 @@ zfs_ioctl_init(void)
POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE,
zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv));
+ zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS,
+ zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME,
+ POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props,
+ ARRAY_SIZE(zfs_keys_vdev_get_props));
+
+ zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS,
+ zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props));
+
+ zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB,
+ zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_NONE, B_TRUE, B_TRUE,
+ zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub));
+
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
@@ -7268,7 +7489,7 @@ zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec)
*/
for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
- char *name = nvpair_name(pair);
+ const char *name = nvpair_name(pair);
data_type_t type = nvpair_type(pair);
boolean_t identified = B_FALSE;
@@ -7359,7 +7580,7 @@ zfsdev_getminor(zfs_file_t *fp, minor_t *minorp)
mutex_enter(&zfsdev_state_lock);
- for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) {
if (zs->zs_minor == -1)
continue;
@@ -7381,9 +7602,9 @@ zfsdev_get_state(minor_t minor, enum zfsdev_state_type which)
{
zfsdev_state_t *zs;
- for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) {
if (zs->zs_minor == minor) {
- smp_rmb();
+ membar_consumer();
switch (which) {
case ZST_ONEXIT:
return (zs->zs_onexit);
@@ -7435,7 +7656,7 @@ zfsdev_state_init(void *priv)
if (minor == 0)
return (SET_ERROR(ENXIO));
- for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) {
if (zs->zs_minor == -1)
break;
zsprev = zs;
@@ -7719,13 +7940,11 @@ zfs_kmod_init(void)
zfs_ioctl_init();
mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
- zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
- zfsdev_state_list->zs_minor = -1;
+ zfsdev_state_listhead.zs_minor = -1;
if ((error = zfsdev_attach()) != 0)
goto out;
- tsd_create(&zfs_fsyncer_key, NULL);
tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
@@ -7747,13 +7966,14 @@ zfs_kmod_fini(void)
mutex_destroy(&zfsdev_state_lock);
- for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) {
+ for (zs = &zfsdev_state_listhead; zs != NULL; zs = zsnext) {
zsnext = zs->zs_next;
if (zs->zs_onexit)
zfs_onexit_destroy(zs->zs_onexit);
if (zs->zs_zevent)
zfs_zevent_destroy(zs->zs_zevent);
- kmem_free(zs, sizeof (zfsdev_state_t));
+ if (zs != &zfsdev_state_listhead)
+ kmem_free(zs, sizeof (zfsdev_state_t));
}
zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */
@@ -7761,15 +7981,12 @@ zfs_kmod_fini(void)
spa_fini();
zvol_fini();
- tsd_destroy(&zfs_fsyncer_key);
tsd_destroy(&rrw_tsd_key);
tsd_destroy(&zfs_allow_log_key);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW,
- "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls");
+ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW,
+ "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls");
-ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW,
- "Maximum size in bytes of ZFS ioctl output that will be logged");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW,
+ "Maximum size in bytes of ZFS ioctl output that will be logged");
diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c
index e248dc3cc4e8..fa4e7093ca46 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_log.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_log.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
*/
@@ -107,86 +108,81 @@ zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
static void
zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
{
- uint32_t *bitmap;
- uint64_t *attrs;
- uint64_t *crtime;
- xoptattr_t *xoap;
- void *scanstamp;
- int i;
+ xoptattr_t *xoap;
xoap = xva_getxoptattr(xvap);
ASSERT(xoap);
lrattr->lr_attr_masksize = xvap->xva_mapsize;
- bitmap = &lrattr->lr_attr_bitmap;
- for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
+ uint32_t *bitmap = &lrattr->lr_attr_bitmap;
+ for (int i = 0; i != xvap->xva_mapsize; i++, bitmap++)
*bitmap = xvap->xva_reqattrmap[i];
- }
- /* Now pack the attributes up in a single uint64_t */
- attrs = (uint64_t *)bitmap;
- *attrs = 0;
- crtime = attrs + 1;
- bzero(crtime, 2 * sizeof (uint64_t));
- scanstamp = (caddr_t)(crtime + 2);
- bzero(scanstamp, AV_SCANSTAMP_SZ);
+ lr_attr_end_t *end = (lr_attr_end_t *)bitmap;
+ end->lr_attr_attrs = 0;
+ end->lr_attr_crtime[0] = 0;
+ end->lr_attr_crtime[1] = 0;
+ memset(end->lr_attr_scanstamp, 0, AV_SCANSTAMP_SZ);
+
if (XVA_ISSET_REQ(xvap, XAT_READONLY))
- *attrs |= (xoap->xoa_readonly == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_readonly == 0) ? 0 :
XAT0_READONLY;
if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
- *attrs |= (xoap->xoa_hidden == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_hidden == 0) ? 0 :
XAT0_HIDDEN;
if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
- *attrs |= (xoap->xoa_system == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_system == 0) ? 0 :
XAT0_SYSTEM;
if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
- *attrs |= (xoap->xoa_archive == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_archive == 0) ? 0 :
XAT0_ARCHIVE;
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
- *attrs |= (xoap->xoa_immutable == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_immutable == 0) ? 0 :
XAT0_IMMUTABLE;
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
- *attrs |= (xoap->xoa_nounlink == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_nounlink == 0) ? 0 :
XAT0_NOUNLINK;
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
- *attrs |= (xoap->xoa_appendonly == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_appendonly == 0) ? 0 :
XAT0_APPENDONLY;
if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
- *attrs |= (xoap->xoa_opaque == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_opaque == 0) ? 0 :
XAT0_APPENDONLY;
if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
- *attrs |= (xoap->xoa_nodump == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_nodump == 0) ? 0 :
XAT0_NODUMP;
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
- *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
XAT0_AV_QUARANTINED;
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
- *attrs |= (xoap->xoa_av_modified == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_av_modified == 0) ? 0 :
XAT0_AV_MODIFIED;
if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
- ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, end->lr_attr_crtime);
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
- bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+ memcpy(end->lr_attr_scanstamp, xoap->xoa_av_scanstamp,
+ AV_SCANSTAMP_SZ);
} else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
/*
* XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
* at the same time, so we can share the same space.
*/
- bcopy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t));
+ memcpy(end->lr_attr_scanstamp, &xoap->xoa_projid,
+ sizeof (uint64_t));
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
- *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_reparse == 0) ? 0 :
XAT0_REPARSE;
if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
- *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_offline == 0) ? 0 :
XAT0_OFFLINE;
if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
- *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_sparse == 0) ? 0 :
XAT0_SPARSE;
if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT))
- *attrs |= (xoap->xoa_projinherit == 0) ? 0 :
+ end->lr_attr_attrs |= (xoap->xoa_projinherit == 0) ? 0 :
XAT0_PROJINHERIT;
}
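
For readers following the zfs_log_xvattr() rework: the old pointer arithmetic over the bitmap tail is replaced by an lr_attr_end_t overlay. The authoritative definition lives in the ZIL headers; inferred purely from the usage above, its shape is roughly:

/*
 * Sketch of the trailing lr_attr_t payload as written above; the
 * real lr_attr_end_t definition is in the ZIL headers, and the
 * field types here are deduced from the code, not copied.
 */
#include <stdint.h>

#ifndef AV_SCANSTAMP_SZ
#define	AV_SCANSTAMP_SZ	32	/* per sys/xvattr.h */
#endif

typedef struct lr_attr_end_sketch {
	uint64_t	lr_attr_attrs;		/* packed XAT0_* flags */
	uint64_t	lr_attr_crtime[2];	/* ZFS_TIME_ENCODE() output */
	uint8_t		lr_attr_scanstamp[AV_SCANSTAMP_SZ]; /* or projid */
} lr_attr_end_sketch_t;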
@@ -214,7 +210,7 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
if (fuidp->z_domain_str_sz != 0) {
for (zdomain = list_head(&fuidp->z_domains); zdomain;
zdomain = list_next(&fuidp->z_domains, zdomain)) {
- bcopy((void *)zdomain->z_domain, start,
+ memcpy(start, zdomain->z_domain,
strlen(zdomain->z_domain) + 1);
start = (caddr_t)start +
strlen(zdomain->z_domain) + 1;
@@ -392,7 +388,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
else
lracl->lr_acl_flags = 0;
- bcopy(vsecp->vsa_aclentp, end, aclsize);
+ memcpy(end, vsecp->vsa_aclentp, aclsize);
end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
}
@@ -404,7 +400,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
/*
* Now place file name in log record
*/
- bcopy(name, end, namesize);
+ memcpy(end, name, namesize);
zil_itx_assign(zilog, itx, tx);
}
@@ -426,7 +422,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_remove_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
+ memcpy(lr + 1, name, namesize);
itx->itx_oid = foid;
@@ -462,7 +458,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr = (lr_link_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_link_obj = zp->z_id;
- bcopy(name, (char *)(lr + 1), namesize);
+ memcpy(lr + 1, name, namesize);
zil_itx_assign(zilog, itx, tx);
}
@@ -493,8 +489,31 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
sizeof (uint64_t));
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
lr->lr_crtime, sizeof (uint64_t) * 2);
- bcopy(name, (char *)(lr + 1), namesize);
- bcopy(link, (char *)(lr + 1) + namesize, linksize);
+ memcpy((char *)(lr + 1), name, namesize);
+ memcpy((char *)(lr + 1) + namesize, link, linksize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+static void
+do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
+ const char *sname, znode_t *tdzp, const char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1;
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id;
+ lr->lr_tdoid = tdzp->z_id;
+ memcpy((char *)(lr + 1), sname, snamesize);
+ memcpy((char *)(lr + 1) + snamesize, dname, dnamesize);
+ itx->itx_oid = szp->z_id;
zil_itx_assign(zilog, itx, tx);
}
@@ -506,20 +525,73 @@ void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
const char *sname, znode_t *tdzp, const char *dname, znode_t *szp)
{
+ txtype |= TX_RENAME;
+ do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp);
+}
+
+/*
+ * Handles TX_RENAME_EXCHANGE transactions.
+ */
+void
+zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname,
+ znode_t *szp)
+{
+ txtype |= TX_RENAME_EXCHANGE;
+ do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp);
+}
+
+/*
+ * Handles TX_RENAME_WHITEOUT transactions.
+ *
+ * Unfortunately we cannot reuse do_zfs_log_rename because we need to call
+ * zfs_mknode() on replay which requires stashing bits as with TX_CREATE.
+ */
+void
+zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname,
+ znode_t *szp, znode_t *wzp)
+{
itx_t *itx;
- lr_rename_t *lr;
+ lr_rename_whiteout_t *lr;
size_t snamesize = strlen(sname) + 1;
size_t dnamesize = strlen(dname) + 1;
if (zil_replaying(zilog, tx))
return;
+ txtype |= TX_RENAME_WHITEOUT;
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
- lr = (lr_rename_t *)&itx->itx_lr;
- lr->lr_sdoid = sdzp->z_id;
- lr->lr_tdoid = tdzp->z_id;
- bcopy(sname, (char *)(lr + 1), snamesize);
- bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+ lr = (lr_rename_whiteout_t *)&itx->itx_lr;
+ lr->lr_rename.lr_sdoid = sdzp->z_id;
+ lr->lr_rename.lr_tdoid = tdzp->z_id;
+
+ /*
+ * RENAME_WHITEOUT will create an entry at the source znode, so we need
+ * to store the same data that the equivalent call to zfs_log_create()
+ * would.
+ */
+ lr->lr_wfoid = wzp->z_id;
+ LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT);
+ (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen,
+ sizeof (uint64_t));
+ (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)),
+ lr->lr_wcrtime, sizeof (uint64_t) * 2);
+ lr->lr_wmode = wzp->z_mode;
+ lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp));
+ lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp));
+
+ /*
+ * This rdev will always be makedevice(0, 0), but because the ZIL log and
+ * replay code needs to be platform independent (and there is no
+ * platform independent makedev()) we need to copy the one created
+ * during the rename operation.
+ */
+ (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev,
+ sizeof (lr->lr_wrdev));
+
+ memcpy((char *)(lr + 1), sname, snamesize);
+ memcpy((char *)(lr + 1) + snamesize, dname, dnamesize);
itx->itx_oid = szp->z_id;
zil_itx_assign(zilog, itx, tx);
@@ -530,17 +602,16 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
* called as soon as the write is on stable storage (be it via a DMU sync or a
* ZIL commit).
*/
-long zfs_immediate_write_sz = 32768;
+static int64_t zfs_immediate_write_sz = 32768;
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t resid, int ioflag,
+ znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
zil_callback_t callback, void *callback_data)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
uint32_t blocksize = zp->z_blksz;
itx_wr_state_t write_state;
- uintptr_t fsync_cnt;
uint64_t gen = 0;
ssize_t size = resid;
@@ -556,15 +627,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
else if (!spa_has_slogs(zilog->zl_spa) &&
resid >= zfs_immediate_write_sz)
write_state = WR_INDIRECT;
- else if (ioflag & (O_SYNC | O_DSYNC))
+ else if (commit)
write_state = WR_COPIED;
else
write_state = WR_NEED_COPY;
- if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
- (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
- }
-
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
sizeof (gen));
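
A standalone sketch of the write-state choice visible in this hunk, covering only the three branches shown (the earlier blocksize test lies outside the hunk). The helper and its arguments are hypothetical; the 32768-byte threshold mirrors zfs_immediate_write_sz above.

#include <stdbool.h>
#include <stdint.h>

enum wr_state { WR_INDIRECT, WR_COPIED, WR_NEED_COPY };

static const int64_t immediate_write_sz = 32768;	/* zfs_immediate_write_sz */

static enum wr_state
choose_write_state(bool has_slog, int64_t resid, bool commit)
{
	/* Large writes with no separate log device are logged by reference. */
	if (!has_slog && resid >= immediate_write_sz)
		return (WR_INDIRECT);
	/* Synchronous writes copy the data into the log record right away. */
	if (commit)
		return (WR_COPIED);
	/* Otherwise defer the copy until the itx is committed. */
	return (WR_NEED_COPY);
}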
@@ -615,12 +682,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
BP_ZERO(&lr->lr_blkptr);
itx->itx_private = ZTOZSB(zp);
+ itx->itx_sync = (zp->z_sync_cnt != 0);
itx->itx_gen = gen;
- if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) &&
- (fsync_cnt == 0))
- itx->itx_sync = B_FALSE;
-
itx->itx_callback = callback;
itx->itx_callback_data = callback_data;
zil_itx_assign(zilog, itx, tx);
@@ -721,6 +785,40 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
}
/*
+ * Handles TX_SETSAXATTR transactions.
+ */
+void
+zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, const char *name, const void *value, size_t size)
+{
+ itx_t *itx;
+ lr_setsaxattr_t *lr;
+ size_t recsize = sizeof (lr_setsaxattr_t);
+ void *xattrstart;
+ int namelen;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ namelen = strlen(name) + 1;
+ recsize += (namelen + size);
+ itx = zil_itx_create(txtype, recsize);
+ lr = (lr_setsaxattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ xattrstart = (char *)(lr + 1);
+ memcpy(xattrstart, name, namelen);
+ if (value != NULL) {
+ memcpy((char *)xattrstart + namelen, value, size);
+ lr->lr_size = size;
+ } else {
+ lr->lr_size = 0;
+ }
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
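
A standalone sketch of the variable-length record layout zfs_log_setsaxattr() builds: a fixed header followed by the NUL-terminated name and, optionally, the value bytes, with the size field doubling as a removal marker. The struct below is a placeholder, not the real lr_setsaxattr_t.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct xattr_rec {			/* placeholder header */
	uint64_t foid;			/* object id */
	uint64_t size;			/* value length, 0 means "remove" */
	/* NUL-terminated name and value bytes follow */
};

static struct xattr_rec *
pack_xattr_rec(uint64_t foid, const char *name, const void *value, size_t size)
{
	size_t namelen = strlen(name) + 1;
	struct xattr_rec *rec = malloc(sizeof (*rec) + namelen + size);

	if (rec == NULL)
		return (NULL);
	rec->foid = foid;
	rec->size = (value != NULL) ? size : 0;
	memcpy((char *)(rec + 1), name, namelen);
	if (value != NULL)
		memcpy((char *)(rec + 1) + namelen, value, size);
	return (rec);
}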
+
+/*
* Handles TX_ACL transactions.
*/
void
@@ -768,11 +866,11 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
if (txtype == TX_ACL_V0) {
lrv0 = (lr_acl_v0_t *)lr;
- bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
+ memcpy(lrv0 + 1, vsecp->vsa_aclentp, aclbytes);
} else {
void *start = (ace_t *)(lr + 1);
- bcopy(vsecp->vsa_aclentp, start, aclbytes);
+ memcpy(start, vsecp->vsa_aclentp, aclbytes);
start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
@@ -786,7 +884,52 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
zil_itx_assign(zilog, itx, tx);
}
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW,
+/*
+ * Handles TX_CLONE_RANGE transactions.
+ */
+void
+zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
+ uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps,
+ size_t nbps)
+{
+ itx_t *itx;
+ lr_clone_range_t *lr;
+ uint64_t partlen, max_log_data;
+ size_t partnbps;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));
+
+ while (nbps > 0) {
+ partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
+ partlen = partnbps * blksz;
+ ASSERT3U(partlen, <, len + blksz);
+ partlen = MIN(partlen, len);
+
+ itx = zil_itx_create(txtype,
+ sizeof (*lr) + sizeof (bps[0]) * partnbps);
+ lr = (lr_clone_range_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = partlen;
+ lr->lr_blksz = blksz;
+ lr->lr_nbps = partnbps;
+ memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+
+ zil_itx_assign(zilog, itx, tx);
+
+ bps += partnbps;
+ ASSERT3U(nbps, >=, partnbps);
+ nbps -= partnbps;
+ off += partlen;
+ ASSERT3U(len, >=, partlen);
+ len -= partlen;
+ }
+}
+
+ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW,
"Largest data block to write to zil");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
index 7c56dd9c97f5..7bf804b67790 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_onexit.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -87,8 +87,7 @@ zfs_onexit_destroy(zfs_onexit_t *zo)
zfs_onexit_action_node_t *ap;
mutex_enter(&zo->zo_lock);
- while ((ap = list_head(&zo->zo_actions)) != NULL) {
- list_remove(&zo->zo_actions, ap);
+ while ((ap = list_remove_head(&zo->zo_actions)) != NULL) {
mutex_exit(&zo->zo_lock);
ap->za_func(ap->za_data);
kmem_free(ap, sizeof (zfs_onexit_action_node_t));
@@ -151,7 +150,7 @@ zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
*/
int
zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
- uint64_t *action_handle)
+ uintptr_t *action_handle)
{
zfs_onexit_t *zo;
zfs_onexit_action_node_t *ap;
@@ -170,7 +169,7 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
list_insert_tail(&zo->zo_actions, ap);
mutex_exit(&zo->zo_lock);
if (action_handle)
- *action_handle = (uint64_t)(uintptr_t)ap;
+ *action_handle = (uintptr_t)ap;
return (0);
}
diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c
index e61db5c7ab83..9b351eefc04e 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_quota.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,8 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek
* Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c
index b18b480ce527..091562ca6852 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c
index e6ed3e738e40..2e0af60f6db4 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_replay.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 Cyril Plisko. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/types.h>
@@ -47,6 +48,8 @@
#include <sys/atomic.h>
#include <sys/cred.h>
#include <sys/zpl.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfeature.h>
/*
* NB: FreeBSD expects to be able to do vnode locking in lookup and
@@ -68,7 +71,7 @@ static void
zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
{
- bzero(vap, sizeof (*vap));
+ memset(vap, 0, sizeof (*vap));
vap->va_mask = (uint_t)mask;
vap->va_mode = mode;
#if defined(__FreeBSD__) || defined(__APPLE__)
@@ -80,10 +83,10 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
vap->va_nodeid = nodeid;
}
-/* ARGSUSED */
static int
zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap)
{
+ (void) arg1, (void) arg2, (void) byteswap;
return (SET_ERROR(ENOTSUP));
}
@@ -141,13 +144,13 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
- bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+ memcpy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
} else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
/*
* XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
* at the same time, so we can share the same space.
*/
- bcopy(scanstamp, &xoap->xoa_projid, sizeof (uint64_t));
+ memcpy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t));
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
@@ -306,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
uint64_t dnodesize;
int error;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl));
+
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
if (byteswap) {
byteswap_uint64_array(lracl, sizeof (*lracl));
@@ -362,7 +367,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
(void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
lr->lr_uid, lr->lr_gid);
- fallthrough;
+ zfs_fallthrough;
case TX_CREATE_ACL_ATTR:
if (name == NULL) {
lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
@@ -384,8 +389,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
lr->lr_uid, lr->lr_gid);
}
+#if defined(__linux__)
+ error = zfs_create(dzp, name, &xva.xva_vattr,
+ 0, 0, &zp, kcred, vflg, &vsec, zfs_init_idmap);
+#else
error = zfs_create(dzp, name, &xva.xva_vattr,
- 0, 0, &zp, kcred, vflg, &vsec);
+ 0, 0, &zp, kcred, vflg, &vsec, NULL);
+#endif
break;
case TX_MKDIR_ACL:
aclstart = (caddr_t)(lracl + 1);
@@ -394,7 +404,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
(void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
lr->lr_uid, lr->lr_gid);
- fallthrough;
+ zfs_fallthrough;
case TX_MKDIR_ACL_ATTR:
if (name == NULL) {
lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
@@ -414,8 +424,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
(void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
lr->lr_uid, lr->lr_gid);
}
+#if defined(__linux__)
error = zfs_mkdir(dzp, name, &xva.xva_vattr,
- &zp, kcred, vflg, &vsec);
+ &zp, kcred, vflg, &vsec, zfs_init_idmap);
+#else
+ error = zfs_mkdir(dzp, name, &xva.xva_vattr,
+ &zp, kcred, vflg, &vsec, NULL);
+#endif
break;
default:
error = SET_ERROR(ENOTSUP);
@@ -457,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
uint64_t dnodesize;
int error;
+ ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
@@ -500,9 +517,9 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
*
* The _ATTR versions will grab the fuid info in their subcases.
*/
- if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
- (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
- (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
+ if (txtype != TX_SYMLINK &&
+ txtype != TX_MKDIR_ATTR &&
+ txtype != TX_CREATE_ATTR) {
start = (lr + 1);
zfsvfs->z_fuid_replay =
zfs_replay_fuid_domain(start, &start,
@@ -519,14 +536,19 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
zfs_replay_fuid_domain(start, &start,
lr->lr_uid, lr->lr_gid);
name = (char *)start;
- fallthrough;
+ zfs_fallthrough;
case TX_CREATE:
if (name == NULL)
name = (char *)start;
+#if defined(__linux__)
+ error = zfs_create(dzp, name, &xva.xva_vattr,
+ 0, 0, &zp, kcred, vflg, NULL, zfs_init_idmap);
+#else
error = zfs_create(dzp, name, &xva.xva_vattr,
- 0, 0, &zp, kcred, vflg, NULL);
+ 0, 0, &zp, kcred, vflg, NULL, NULL);
+#endif
break;
case TX_MKDIR_ATTR:
lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
@@ -537,14 +559,20 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
zfs_replay_fuid_domain(start, &start,
lr->lr_uid, lr->lr_gid);
name = (char *)start;
- fallthrough;
+ zfs_fallthrough;
case TX_MKDIR:
if (name == NULL)
name = (char *)(lr + 1);
+#if defined(__linux__)
error = zfs_mkdir(dzp, name, &xva.xva_vattr,
- &zp, kcred, vflg, NULL);
+ &zp, kcred, vflg, NULL, zfs_init_idmap);
+#else
+ error = zfs_mkdir(dzp, name, &xva.xva_vattr,
+ &zp, kcred, vflg, NULL, NULL);
+#endif
+
break;
case TX_MKXATTR:
error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred);
@@ -552,8 +580,13 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
case TX_SYMLINK:
name = (char *)(lr + 1);
link = name + strlen(name) + 1;
+#if defined(__linux__)
+ error = zfs_symlink(dzp, name, &xva.xva_vattr,
+ link, &zp, kcred, vflg, zfs_init_idmap);
+#else
error = zfs_symlink(dzp, name, &xva.xva_vattr,
- link, &zp, kcred, vflg);
+ link, &zp, kcred, vflg, NULL);
+#endif
break;
default:
error = SET_ERROR(ENOTSUP);
@@ -584,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
int error;
int vflg = 0;
+ ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -619,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
int error;
int vflg = 0;
+ ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -641,18 +678,21 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
}
static int
-zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
+do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname,
+ char *tname, uint64_t rflags, vattr_t *wo_vap)
{
- zfsvfs_t *zfsvfs = arg1;
- lr_rename_t *lr = arg2;
- char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
- char *tname = sname + strlen(sname) + 1;
znode_t *sdzp, *tdzp;
- int error;
- int vflg = 0;
+ int error, vflg = 0;
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
+ /* Only Linux currently supports RENAME_* flags. */
+#ifdef __linux__
+ VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT));
+
+ /* wo_vap must be non-NULL iff. we're doing RENAME_WHITEOUT */
+ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
+#else
+ VERIFY0(rflags);
+#endif
if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
return (error);
@@ -665,7 +705,13 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
if (lr->lr_common.lrc_txtype & TX_CI)
vflg |= FIGNORECASE;
- error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg);
+#if defined(__linux__)
+ error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags,
+ wo_vap, zfs_init_idmap);
+#else
+ error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags,
+ wo_vap, NULL);
+#endif
zrele(tdzp);
zrele(sdzp);
@@ -673,6 +719,92 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
}
static int
+zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_rename_t *lr = arg2;
+
+ ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+ char *tname = sname + strlen(sname) + 1;
+ return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL));
+}
+
+static int
+zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap)
+{
+#ifdef __linux__
+ zfsvfs_t *zfsvfs = arg1;
+ lr_rename_t *lr = arg2;
+
+ ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr));
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+ char *tname = sname + strlen(sname) + 1;
+ return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE,
+ NULL));
+#else
+ return (SET_ERROR(ENOTSUP));
+#endif
+}
+
+static int
+zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap)
+{
+#ifdef __linux__
+ zfsvfs_t *zfsvfs = arg1;
+ lr_rename_whiteout_t *lr = arg2;
+ int error;
+ /* For the whiteout file. */
+ xvattr_t xva;
+ uint64_t objid;
+ uint64_t dnodesize;
+
+ ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr));
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ objid = LR_FOID_GET_OBJ(lr->lr_wfoid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
+ lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid);
+
+ /*
+ * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which
+ * assigns the object's creation time, generation number, and dnode
+ * slot count. The generic zfs_rename() has no concept of these
+ * attributes, so we smuggle the values inside the vattr's otherwise
+ * unused va_ctime, va_nblocks, and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime);
+ xva.xva_vattr.va_nblocks = lr->lr_wgen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+ if (error)
+ return (error);
+
+ /* sname and tname follow lr_rename_whiteout_t */
+ char *sname = (char *)(lr + 1);
+ char *tname = sname + strlen(sname) + 1;
+ return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname,
+ RENAME_WHITEOUT, &xva.xva_vattr));
+#else
+ return (SET_ERROR(ENOTSUP));
+#endif
+}
+
+static int
zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
zfsvfs_t *zfsvfs = arg1;
@@ -682,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
int error;
uint64_t eod, offset, length;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -745,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
int error;
uint64_t end;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -789,16 +925,17 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
zfsvfs_t *zfsvfs = arg1;
lr_truncate_t *lr = arg2;
znode_t *zp;
- flock64_t fl;
+ flock64_t fl = {0};
int error;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- bzero(&fl, sizeof (fl));
fl.l_type = F_WRLCK;
fl.l_whence = SEEK_SET;
fl.l_start = lr->lr_offset;
@@ -823,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
int error;
void *start;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
xva_init(&xva);
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
@@ -859,7 +998,11 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
lr->lr_uid, lr->lr_gid);
- error = zfs_setattr(zp, vap, 0, kcred);
+#if defined(__linux__)
+ error = zfs_setattr(zp, vap, 0, kcred, zfs_init_idmap);
+#else
+ error = zfs_setattr(zp, vap, 0, kcred, NULL);
+#endif
zfs_fuid_info_free(zfsvfs->z_fuid_replay);
zfsvfs->z_fuid_replay = NULL;
@@ -869,15 +1012,102 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
}
static int
+zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_setsaxattr_t *lr = arg2;
+ znode_t *zp;
+ nvlist_t *nvl;
+ size_t sa_size;
+ char *name;
+ char *value;
+ size_t size;
+ int error = 0;
+
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+ ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size);
+
+ ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa,
+ SPA_FEATURE_ZILSAXATTR));
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ rw_enter(&zp->z_xattr_lock, RW_WRITER);
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ goto out;
+
+ ASSERT(zp->z_xattr_cached);
+ nvl = zp->z_xattr_cached;
+
+ /* Get xattr name, value and size from log record */
+ size = lr->lr_size;
+ name = (char *)(lr + 1);
+ if (size == 0) {
+ value = NULL;
+ error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
+ } else {
+ value = name + strlen(name) + 1;
+ /* Limited to 32k to keep nvpair memory allocations small */
+ if (size > DXATTR_MAX_ENTRY_SIZE) {
+ error = SET_ERROR(EFBIG);
+ goto out;
+ }
+
+ /* Prevent the DXATTR SA from consuming the entire SA region */
+ error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
+ if (error)
+ goto out;
+
+ if (sa_size > DXATTR_MAX_SA_SIZE) {
+ error = SET_ERROR(EFBIG);
+ goto out;
+ }
+
+ error = nvlist_add_byte_array(nvl, name, (uchar_t *)value,
+ size);
+ }
+
+ /*
+ * Update the SA for additions, modifications, and removals. On
+ * error, drop the inconsistent cached version of the nvlist; it
+ * will be reconstructed from the ARC when next accessed.
+ */
+ if (error == 0)
+ error = zfs_sa_set_xattr(zp, name, value, size);
+
+ if (error) {
+ nvlist_free(nvl);
+ zp->z_xattr_cached = NULL;
+ }
+
+out:
+ rw_exit(&zp->z_xattr_lock);
+ zrele(zp);
+ return (error);
+}
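
A userland libnvpair sketch of the nvlist side of the replay above, assuming the standard nvlist_remove()/nvlist_add_byte_array() interfaces: a zero size removes the pair, otherwise the value is stored as a byte array. The kernel path additionally pushes the result into the DXATTR SA, which is omitted here.

#include <libnvpair.h>

static int
apply_sa_xattr(nvlist_t *nvl, const char *name, void *value, size_t size)
{
	if (size == 0)
		return (nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY));
	return (nvlist_add_byte_array(nvl, name, value, (uint_t)size));
}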
+
+static int
zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
{
zfsvfs_t *zfsvfs = arg1;
lr_acl_v0_t *lr = arg2;
ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
- vsecattr_t vsa;
+ vsecattr_t vsa = {0};
znode_t *zp;
int error;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) +
+ sizeof (ace_t) * lr->lr_aclcnt);
+
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
zfs_oldace_byteswap(ace, lr->lr_aclcnt);
@@ -886,7 +1116,6 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- bzero(&vsa, sizeof (vsa));
vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
vsa.vsa_aclcnt = lr->lr_aclcnt;
vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
@@ -920,10 +1149,13 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
zfsvfs_t *zfsvfs = arg1;
lr_acl_t *lr = arg2;
ace_t *ace = (ace_t *)(lr + 1);
- vsecattr_t vsa;
+ vsecattr_t vsa = {0};
znode_t *zp;
int error;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes);
+
if (byteswap) {
byteswap_uint64_array(lr, sizeof (*lr));
zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
@@ -937,7 +1169,6 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- bzero(&vsa, sizeof (vsa));
vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
vsa.vsa_aclcnt = lr->lr_aclcnt;
vsa.vsa_aclentp = ace;
@@ -964,10 +1195,42 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}
+static int
+zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_clone_range_t *lr = arg2;
+ znode_t *zp;
+ int error;
+
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
+ lr_bps[lr->lr_nbps]));
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * Clones can be logged out of order, so don't be surprised if
+ * the file is gone - just return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length,
+ lr->lr_blksz, lr->lr_bps, lr->lr_nbps);
+
+ zrele(zp);
+ return (error);
+}
+
/*
* Callback vectors for replaying records
*/
-zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_error, /* no such type */
zfs_replay_create, /* TX_CREATE */
zfs_replay_create, /* TX_MKDIR */
@@ -989,4 +1252,8 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
zfs_replay_write2, /* TX_WRITE2 */
+ zfs_replay_setsaxattr, /* TX_SETSAXATTR */
+ zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */
+ zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */
+ zfs_replay_clone_range, /* TX_CLONE_RANGE */
};
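
A toy sketch of the dispatch pattern behind zfs_replay_vector: the record's transaction type indexes a table of handler functions, with slot 0 reserved for "no such type". Everything below is hypothetical and only illustrates the table-driven dispatch.

#include <errno.h>

typedef int replay_func_t(void *fs, void *record, int byteswap);

static int
replay_unknown(void *fs, void *record, int byteswap)
{
	(void) fs; (void) record; (void) byteswap;
	return (ENOTSUP);
}

enum { TOY_TX_NONE = 0, TOY_TX_CREATE, TOY_TX_MAX };

static replay_func_t *const toy_replay_vector[TOY_TX_MAX] = {
	replay_unknown,		/* no such type */
	replay_unknown,		/* TOY_TX_CREATE (placeholder handler) */
};

static int
toy_replay(void *fs, void *record, unsigned int txtype, int byteswap)
{
	if (txtype >= TOY_TX_MAX)
		txtype = TOY_TX_NONE;
	return (toy_replay_vector[txtype](fs, record, byteswap));
}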
diff --git a/sys/contrib/openzfs/module/zfs/zfs_rlock.c b/sys/contrib/openzfs/module/zfs/zfs_rlock.c
index 06a5e031a7df..f42661df82e4 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_rlock.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_rlock.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/zfs_sa.c b/sys/contrib/openzfs/module/zfs/zfs_sa.c
index 67be131da63b..fb2443b756f8 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_sa.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_sa.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -29,6 +29,7 @@
#include <sys/zfs_sa.h>
#include <sys/dmu_objset.h>
#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
/*
* ZPL attribute registration table.
@@ -43,7 +44,7 @@
* this version of ZFS won't change or delete them.
*/
-sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+const sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
@@ -69,7 +70,10 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
{NULL, 0, 0, 0}
};
+
#ifdef _KERNEL
+static int zfs_zil_saxattr = 1;
+
int
zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio)
{
@@ -103,8 +107,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
if (len) {
- bcopy(link, (caddr_t)db->db_data +
- ZFS_OLD_ZNODE_PHYS_SIZE, len);
+ memcpy((caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE, link, len);
}
} else {
dmu_buf_t *dbp;
@@ -116,7 +120,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
dmu_buf_will_dirty(dbp, tx);
ASSERT3U(len, <=, dbp->db_size);
- bcopy(link, dbp->db_data, len);
+ memcpy(dbp->db_data, link, len);
dmu_buf_rele(dbp, FTAG);
}
}
@@ -219,13 +223,14 @@ zfs_sa_get_xattr(znode_t *zp)
}
int
-zfs_sa_set_xattr(znode_t *zp)
+zfs_sa_set_xattr(znode_t *zp, const char *name, const void *value, size_t vsize)
{
zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zilog_t *zilog;
dmu_tx_t *tx;
char *obj;
size_t size;
- int error;
+ int error, logsaxattr = 0;
ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
ASSERT(zp->z_xattr_cached);
@@ -244,6 +249,17 @@ zfs_sa_set_xattr(znode_t *zp)
if (error)
goto out_free;
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Users enable ZIL logging of xattr=sa operations by enabling the
+ * SPA_FEATURE_ZILSAXATTR feature on the pool. The feature is activated
+ * during zil_process_commit_list/zil_create, if enabled.
+ */
+ if (spa_feature_is_enabled(zfsvfs->z_os->os_spa,
+ SPA_FEATURE_ZILSAXATTR) && zfs_zil_saxattr)
+ logsaxattr = 1;
+
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa_create(tx, size);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
@@ -256,6 +272,10 @@ zfs_sa_set_xattr(znode_t *zp)
sa_bulk_attr_t bulk[2];
uint64_t ctime[2];
+ if (logsaxattr)
+ zfs_log_setsaxattr(zilog, tx, TX_SETSAXATTR, zp, name,
+ value, vsize);
+
zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs),
NULL, obj, size);
@@ -264,6 +284,8 @@ zfs_sa_set_xattr(znode_t *zp)
VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
dmu_tx_commit(tx);
+ if (logsaxattr && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
}
out_free:
vmem_free(obj, size);
@@ -396,8 +418,9 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
/* if scanstamp then add scanstamp */
if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
- bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
- scanstamp, AV_SCANSTAMP_SZ);
+ memcpy(scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ AV_SCANSTAMP_SZ);
SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
NULL, scanstamp, AV_SCANSTAMP_SZ);
zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
@@ -433,6 +456,9 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
}
}
+ZFS_MODULE_PARAM(zfs, zfs_, zil_saxattr, INT, ZMOD_RW,
+ "Disable xattr=sa extended attribute logging in ZIL by settng 0.");
+
EXPORT_SYMBOL(zfs_attr_table);
EXPORT_SYMBOL(zfs_sa_readlink);
EXPORT_SYMBOL(zfs_sa_symlink);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
index 7cbb70f499af..f3db953eab46 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -24,6 +24,7 @@
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -46,34 +47,53 @@
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
+#include <sys/dsl_crypt.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/policy.h>
+#include <sys/zfeature.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>
+/*
+ * Enable the experimental block cloning feature. If this setting is 0, then
+ * even if feature@block_cloning is enabled, attempts to clone blocks will act
+ * as though the feature is disabled.
+ */
+int zfs_bclone_enabled = 1;
-static ulong_t zfs_fsync_sync_cnt = 4;
+/*
+ * When set, zfs_clone_range() waits for dirty data to be written to disk.
+ * This allows the clone operation to reliably succeed when a file is modified
+ * and then immediately cloned. For small files this may be slower than making
+ * a copy of the file and is therefore not the default. However, in certain
+ * scenarios this behavior may be desirable so a tunable is provided.
+ */
+static int zfs_bclone_wait_dirty = 0;
+
+/*
+ * Maximum bytes to read per chunk in zfs_read().
+ */
+static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
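
A userland sketch of reading a region in bounded chunks, the same shape as the loop in zfs_read() that is capped by the tunable above (1 MiB by default); pread() here stands in for the DMU read path and is purely illustrative.

#include <sys/types.h>
#include <unistd.h>

static ssize_t
read_in_chunks(int fd, void *buf, size_t len, off_t off, size_t chunk_size)
{
	size_t done = 0;

	while (done < len) {
		size_t todo = len - done;

		if (todo > chunk_size)
			todo = chunk_size;
		ssize_t n = pread(fd, (char *)buf + done, todo, off + done);
		if (n < 0)
			return (-1);
		if (n == 0)
			break;		/* EOF */
		done += (size_t)n;
	}
	return ((ssize_t)done);
}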
int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
+ int error = 0;
zfsvfs_t *zfsvfs = ZTOZSB(zp);
- (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
-
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
+ atomic_inc_32(&zp->z_sync_writes_cnt);
zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
+ atomic_dec_32(&zp->z_sync_writes_cnt);
+ zfs_exit(zfsvfs, FTAG);
}
- tsd_set(zfs_fsyncer_key, NULL);
-
- return (0);
+ return (error);
}
@@ -102,10 +122,10 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
hole = B_FALSE;
/* Flush any mmap()'d data to disk */
- if (zn_has_cached_data(zp))
- zn_flush_cached_data(zp, B_FALSE);
+ if (zn_has_cached_data(zp, 0, file_sz - 1))
+ zn_flush_cached_data(zp, B_TRUE);
- lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER);
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
zfs_rangelock_exit(lr);
@@ -144,37 +164,44 @@ zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
zfsvfs_t *zfsvfs = ZTOZSB(zp);
int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
error = zfs_holey_common(zp, cmd, off);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
#endif /* SEEK_HOLE && SEEK_DATA */
-/*ARGSUSED*/
int
zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
{
zfsvfs_t *zfsvfs = ZTOZSB(zp);
int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
if (flag & V_ACE_MASK)
- error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+#if defined(__linux__)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
+ zfs_init_idmap);
+#else
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
+ NULL);
+#endif
else
- error = zfs_zaccess_rwx(zp, mode, flag, cr);
+#if defined(__linux__)
+ error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
+#else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
+#endif
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
-static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
-
/*
* Read bytes from specified file into supplied buffer.
*
@@ -192,25 +219,25 @@ static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
* Side Effects:
* inode - atime updated if byte count > 0
*/
-/* ARGSUSED */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
+ (void) cr;
int error = 0;
boolean_t frsync = B_FALSE;
zfsvfs_t *zfsvfs = ZTOZSB(zp);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
if (zp->z_pflags & ZFS_AV_QUARANTINED) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EACCES));
}
/* We don't copy out anything useful for directories. */
if (Z_ISDIR(ZTOTYPE(zp))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EISDIR));
}
@@ -218,7 +245,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* Validate file offset
*/
if (zfs_uio_offset(uio) < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -226,7 +253,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* Fasttrack empty reads
*/
if (zfs_uio_resid(uio) == 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -275,7 +302,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = mappedread_sf(zp, nbytes, uio);
else
#endif
- if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
+ if (zn_has_cached_data(zp, zfs_uio_offset(uio),
+ zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
error = mappedread(zp, nbytes, uio);
} else {
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
@@ -311,10 +339,65 @@ out:
zfs_rangelock_exit(lr);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
+static void
+zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
+ uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
+{
+ zilog_t *zilog = zfsvfs->z_log;
+ const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
+
+ ASSERT(clear_setid_bits_txgp != NULL);
+ ASSERT(tx != NULL);
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the execute bits is set.
+ *
+ * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(zp, cr,
+ ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
+ uint64_t newmode;
+
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /*
+ * Make sure SUID/SGID bits will be removed when we replay the
+ * log. If the setid bits keep coming back, don't log more
+ * than one TX_SETATTR per transaction group.
+ */
+ if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
+ vattr_t va = {0};
+
+ va.va_mask = ATTR_MODE;
+ va.va_nodeid = zp->z_id;
+ va.va_mode = newmode;
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
+ ATTR_MODE, NULL);
+ *clear_setid_bits_txgp = dmu_tx_get_txg(tx);
+ }
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ }
+}
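
A standalone sketch of the mode arithmetic in zfs_clear_setid_bits_if_necessary(): when any execute bit is set and the file carries SUID/SGID, an unprivileged write drops those bits. The privilege check is abstracted to a boolean here; this is not the kernel routine.

#include <stdbool.h>
#include <sys/stat.h>

static mode_t
mode_after_write(mode_t mode, bool may_retain_setid)
{
	bool any_exec = (mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0;
	bool has_setid = (mode & (S_ISUID | S_ISGID)) != 0;

	if (any_exec && has_setid && !may_retain_setid)
		mode &= ~(S_ISUID | S_ISGID);
	return (mode);
}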
+
/*
* Write the bytes to a file.
*
@@ -333,13 +416,12 @@ out:
* Timestamps:
* ip - ctime|mtime updated if byte count > 0
*/
-
-/* ARGSUSED */
int
zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
- int error = 0;
+ int error = 0, error1;
ssize_t start_resid = zfs_uio_resid(uio);
+ uint64_t clear_setid_bits_txg = 0;
/*
* Fasttrack empty write
@@ -349,8 +431,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
return (0);
zfsvfs_t *zfsvfs = ZTOZSB(zp);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
sa_bulk_attr_t bulk[4];
int count = 0;
@@ -367,7 +449,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* so check it explicitly here.
*/
if (zfs_is_readonly(zfsvfs)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EROFS));
}
@@ -379,7 +461,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
if ((zp->z_pflags & ZFS_IMMUTABLE) ||
((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
(zfs_uio_offset(uio) < zp->z_size))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
}
@@ -388,19 +470,17 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
*/
offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
if (woff < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
- const uint64_t max_blksz = zfsvfs->z_max_blksz;
-
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
- * Skip this if uio contains loaned arc_buf.
*/
- if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
- ZFS_EXIT(zfsvfs);
+ ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
+ if (zfs_uio_prefaultpages(pfbytes, uio)) {
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFAULT));
}
@@ -433,9 +513,9 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
}
- if (zn_rlimit_fsize(zp, uio)) {
+ if (zn_rlimit_fsize_uio(zp, uio)) {
zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFBIG));
}
@@ -443,7 +523,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
if (woff >= limit) {
zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFBIG));
}
@@ -452,6 +532,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
uint64_t end_size = MAX(zp->z_size, woff + n);
zilog_t *zilog = zfsvfs->z_log;
+ boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
+ (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);
const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
@@ -474,10 +556,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
break;
}
+ uint64_t blksz;
+ if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
+ if (zp->z_blksz > zfsvfs->z_max_blksz &&
+ !ISP2(zp->z_blksz)) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ blksz = 1 << highbit64(zp->z_blksz);
+ } else {
+ blksz = zfsvfs->z_max_blksz;
+ }
+ blksz = MIN(blksz, P2ROUNDUP(end_size,
+ SPA_MINBLOCKSIZE));
+ blksz = MAX(blksz, zp->z_blksz);
+ } else {
+ blksz = zp->z_blksz;
+ }
+
arc_buf_t *abuf = NULL;
- if (n >= max_blksz && woff >= zp->z_size &&
- P2PHASE(woff, max_blksz) == 0 &&
- zp->z_blksz == max_blksz) {
+ ssize_t nbytes = n;
+ if (n >= blksz && woff >= zp->z_size &&
+ P2PHASE(woff, blksz) == 0 &&
+ (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
/*
* This write covers a full block. "Borrow" a buffer
* from the dmu so that we can fill it before we enter
@@ -485,18 +588,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* holding up the transaction if the data copy hangs
* up on a pagefault (e.g., from an NFS server mapping).
*/
- size_t cbytes;
-
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- max_blksz);
+ blksz);
ASSERT(abuf != NULL);
- ASSERT(arc_buf_size(abuf) == max_blksz);
- if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
- UIO_WRITE, uio, &cbytes))) {
+ ASSERT(arc_buf_size(abuf) == blksz);
+ if ((error = zfs_uiocopy(abuf->b_data, blksz,
+ UIO_WRITE, uio, &nbytes))) {
dmu_return_arcbuf(abuf);
break;
}
- ASSERT3S(cbytes, ==, max_blksz);
+ ASSERT3S(nbytes, ==, blksz);
+ } else {
+ nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
+ P2PHASE(woff, blksz));
+ if (pfbytes < nbytes) {
+ if (zfs_uio_prefaultpages(nbytes, uio)) {
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ pfbytes = nbytes;
+ }
}
/*
@@ -506,8 +617,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
DB_DNODE_ENTER(db);
- dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
- MIN(n, max_blksz));
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
DB_DNODE_EXIT(db);
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -519,37 +629,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
}
/*
+ * NB: We must call zfs_clear_setid_bits_if_necessary before
+ * committing the transaction!
+ */
+
+ /*
* If rangelock_enter() over-locked we grow the blocksize
* and then reduce the lock range. This will only happen
* on the first iteration since rangelock_reduce() will
* shrink down lr_length to the appropriate size.
*/
if (lr->lr_length == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size,
- 1 << highbit64(zp->z_blksz));
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_grow_blocksize(zp, blksz, tx);
zfs_rangelock_reduce(lr, woff, n);
}
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- const ssize_t nbytes =
- MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
ssize_t tx_bytes;
if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio);
@@ -559,6 +653,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
if (error == EFAULT) {
+ zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
+ cr, &clear_setid_bits_txg, tx);
dmu_tx_commit(tx);
/*
* Account for partial writes before
@@ -567,30 +663,23 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* zfs_uio_prefaultpages, or prefaultpages may
* error, and we may break the loop early.
*/
- if (tx_bytes != zfs_uio_resid(uio))
- n -= tx_bytes - zfs_uio_resid(uio);
- if (zfs_uio_prefaultpages(MIN(n, max_blksz),
- uio)) {
- break;
- }
+ n -= tx_bytes - zfs_uio_resid(uio);
+ pfbytes -= tx_bytes - zfs_uio_resid(uio);
continue;
}
#endif
- if (error != 0) {
+ /*
+ * On FreeBSD, EFAULT should be propagated back to the
+ * VFS, which will handle faulting and will retry.
+ */
+ if (error != 0 && error != EFAULT) {
+ zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
+ cr, &clear_setid_bits_txg, tx);
dmu_tx_commit(tx);
break;
}
tx_bytes -= zfs_uio_resid(uio);
} else {
- /* Implied by abuf != NULL: */
- ASSERT3S(n, >=, max_blksz);
- ASSERT0(P2PHASE(woff, max_blksz));
- /*
- * We can simplify nbytes to MIN(n, max_blksz) since
- * P2PHASE(woff, max_blksz) is 0, and knowing
- * n >= max_blksz lets us simplify further:
- */
- ASSERT3S(nbytes, ==, max_blksz);
/*
* Thus, we're writing a full block at a block-aligned
* offset and extending the file past EOF.
@@ -601,6 +690,13 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
error = dmu_assign_arcbuf_by_dbuf(
sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
if (error != 0) {
+ /*
+ * XXX This might not be necessary if
+ * dmu_assign_arcbuf_by_dbuf is guaranteed
+ * to be atomic.
+ */
+ zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
+ cr, &clear_setid_bits_txg, tx);
dmu_return_arcbuf(abuf);
dmu_tx_commit(tx);
break;
@@ -609,7 +705,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
zfs_uioskip(uio, nbytes);
tx_bytes = nbytes;
}
- if (tx_bytes && zn_has_cached_data(zp) &&
+ if (tx_bytes &&
+ zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
!(ioflag & O_DIRECT)) {
update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
}
@@ -626,30 +723,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
break;
}
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the execute bits is set.
- *
- * It would be nice to do this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- *
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
- */
- mutex_enter(&zp->z_acl_lock);
- if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(zp, cr,
- ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
- uint64_t newmode;
- zp->z_mode &= ~(S_ISUID | S_ISGID);
- newmode = zp->z_mode;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
- (void *)&newmode, sizeof (uint64_t), tx);
- }
- mutex_exit(&zp->z_acl_lock);
+ zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
+ &clear_setid_bits_txg, tx);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
@@ -660,7 +735,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
(void) atomic_cas_64(&zp->z_size, end_size,
zfs_uio_offset(uio));
- ASSERT(error == 0);
+ ASSERT(error == 0 || error == EFAULT);
}
/*
* If we are replaying and eof is non zero then force
@@ -670,23 +745,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
zp->z_size = zfsvfs->z_replay_eof;
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ if (error1 != 0)
+ /* Avoid clobbering EFAULT. */
+ error = error1;
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+ /*
+ * NB: During replay, the TX_SETATTR record logged by
+ * zfs_clear_setid_bits_if_necessary must precede any of
+ * the TX_WRITE records logged here.
+ */
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
NULL, NULL);
+
dmu_tx_commit(tx);
if (error != 0)
break;
ASSERT3S(tx_bytes, ==, nbytes);
n -= nbytes;
-
- if (n > 0) {
- if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
- error = SET_ERROR(EFAULT);
- break;
- }
- }
+ pfbytes -= nbytes;
}
zfs_znode_update_vfs(zp);
@@ -699,23 +777,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
*/
if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
error == EFAULT) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
- if (ioflag & (O_SYNC | O_DSYNC) ||
- zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ if (commit)
zil_commit(zilog, zp->z_id);
const int64_t nwritten = start_resid - zfs_uio_resid(uio);
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
task_io_account_write(nwritten);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
-/*ARGSUSED*/
int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
@@ -723,32 +799,31 @@ zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
error = zfs_getacl(zp, vsecp, skipaclchk, cr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
-/*ARGSUSED*/
int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
zfsvfs_t *zfsvfs = ZTOZSB(zp);
int error;
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- zilog_t *zilog = zfsvfs->z_log;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ zilog_t *zilog;
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
+ zilog = zfsvfs->z_log;
error = zfs_setacl(zp, vsecp, skipaclchk, cr);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -777,7 +852,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t zp_gen;
ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
/*
@@ -804,7 +878,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
return (SET_ERROR(ENOENT));
}
- zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;
@@ -827,6 +901,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
+ ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
@@ -855,8 +930,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
#endif
if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
+ error = dmu_buf_hold_noread(os, object, offset, zgd,
+ &db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@@ -901,10 +976,10 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
-/* ARGSUSED */
static void
zfs_get_done(zgd_t *zgd, int error)
{
+ (void) error;
znode_t *zp = zgd->zgd_private;
if (zgd->zgd_db)
@@ -921,6 +996,551 @@ zfs_get_done(zgd_t *zgd, int error)
kmem_free(zgd, sizeof (zgd_t));
}
+static int
+zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+ int error;
+
+ /* Swap. Not sure if the order of zfs_enter()s is important. */
+ if (zfsvfs1 > zfsvfs2) {
+ zfsvfs_t *tmpzfsvfs;
+
+ tmpzfsvfs = zfsvfs2;
+ zfsvfs2 = zfsvfs1;
+ zfsvfs1 = tmpzfsvfs;
+ }
+
+ error = zfs_enter(zfsvfs1, tag);
+ if (error != 0)
+ return (error);
+ if (zfsvfs1 != zfsvfs2) {
+ error = zfs_enter(zfsvfs2, tag);
+ if (error != 0) {
+ zfs_exit(zfsvfs1, tag);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static void
+zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+
+ zfs_exit(zfsvfs1, tag);
+ if (zfsvfs1 != zfsvfs2)
+ zfs_exit(zfsvfs2, tag);
+}
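
A pthread sketch of the deadlock-avoidance idea in zfs_enter_two()/zfs_exit_two(): always acquire the two locks in a fixed (address) order, and take only one of them when both arguments refer to the same object. The mutexes stand in for the per-filesystem teardown locks and are not part of ZFS.

#include <pthread.h>

static void
lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a > b) {		/* order by address: avoids ABBA deadlock */
		pthread_mutex_t *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	if (a != b)
		pthread_mutex_lock(b);
}

static void
unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}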
+
+/*
+ * We split each clone request into chunks that can fit into a single ZIL
+ * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
+ * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
+ * us room for storing 1022 block pointers.
+ *
+ * On success, the function returns the number of bytes copied in *lenp.
+ * Note, it doesn't return how many bytes are left to be copied.
+ * On errors which are caused by any file system or BRT limitation,
+ * `EINVAL` is returned. In most cases the user requested bad parameters:
+ * it may be possible to clone the file, but some parameters don't match
+ * the requirements.
+ */
+int
+zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
+ uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
+{
+ zfsvfs_t *inzfsvfs, *outzfsvfs;
+ objset_t *inos, *outos;
+ zfs_locked_range_t *inlr, *outlr;
+ dmu_buf_impl_t *db;
+ dmu_tx_t *tx;
+ zilog_t *zilog;
+ uint64_t inoff, outoff, len, done;
+ uint64_t outsize, size;
+ int error;
+ int count = 0;
+ sa_bulk_attr_t bulk[3];
+ uint64_t mtime[2], ctime[2];
+ uint64_t uid, gid, projid;
+ blkptr_t *bps;
+ size_t maxblocks, nbps;
+ uint_t inblksz;
+ uint64_t clear_setid_bits_txg = 0;
+ uint64_t last_synced_txg = 0;
+
+ inoff = *inoffp;
+ outoff = *outoffp;
+ len = *lenp;
+ done = 0;
+
+ inzfsvfs = ZTOZSB(inzp);
+ outzfsvfs = ZTOZSB(outzp);
+
+ /*
+ * We need to call zfs_enter() potentially on two different datasets,
+ * so we need a dedicated function for that.
+ */
+ error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
+ if (error != 0)
+ return (error);
+
+ inos = inzfsvfs->z_os;
+ outos = outzfsvfs->z_os;
+
+ /*
+ * Both source and destination have to belong to the same storage pool.
+ */
+ if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * outos and inos belong to the same storage pool.
+ * See a few lines above; only one check is needed.
+ */
+ if (!spa_feature_is_enabled(dmu_objset_spa(outos),
+ SPA_FEATURE_BLOCK_CLONING)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ ASSERT(!outzfsvfs->z_replay);
+
+ /*
+ * Block cloning from an unencrypted dataset into an encrypted
+ * dataset and vice versa is not supported.
+ */
+ if (inos->os_encrypted != outos->os_encrypted) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * Cloning across encrypted datasets is possible only if they
+ * share the same master key.
+ */
+ if (inos != outos && inos->os_encrypted &&
+ !dmu_objset_crypto_key_equal(inos, outos)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ error = zfs_verify_zp(inzp);
+ if (error == 0)
+ error = zfs_verify_zp(outzp);
+ if (error != 0) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (error);
+ }
+
+ /*
+ * We don't copy the source file's flags, which is why we don't allow
+ * cloning files that are in quarantine.
+ */
+ if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EACCES));
+ }
+
+ if (inoff >= inzp->z_size) {
+ *lenp = 0;
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (0);
+ }
+ if (len > inzp->z_size - inoff) {
+ len = inzp->z_size - inoff;
+ }
+ if (len == 0) {
+ *lenp = 0;
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (0);
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(outzfsvfs)) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * If immutable or not appending then return EPERM.
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common()
+ */
+ if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * No overlapping if we are cloning within the same file.
+ */
+ if (inzp == outzp) {
+ if (inoff < outoff + len && outoff < inoff + len) {
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ /* Flush any mmap()'d data to disk */
+ if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
+ zn_flush_cached_data(inzp, B_TRUE);
+
+ /*
+ * Maintain predictable lock order.
+ */
+ if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
+ inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+ RL_READER);
+ outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+ RL_WRITER);
+ } else {
+ outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+ RL_WRITER);
+ inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+ RL_READER);
+ }
+
+ inblksz = inzp->z_blksz;
+
+ /*
+ * We cannot clone into a file with a different block size if we can't
+ * grow it (the block size is already bigger, the file has more than one
+ * block, or it is not locked for growth). There are other possible
+ * reasons for the grow to fail, but we cover what we can before opening
+ * the transaction and detect the rest after we try to do it.
+ */
+ if (inblksz < outzp->z_blksz) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+ if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
+ outlr->lr_length != UINT64_MAX)) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ /*
+ * Block size must be power-of-2 if destination offset != 0.
+ * There can be no multiple blocks of non-power-of-2 size.
+ */
+ if (outoff != 0 && !ISP2(inblksz)) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ /*
+ * Offsets and len must be at block boundaries.
+ */
+ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+ /*
+ * Length must be a multiple of blksz, except at the end of the file.
+ */
+ if ((len % inblksz) != 0 &&
+ (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ /*
+ * If we are copying only one block and it is smaller than the
+ * recordsize property, do not allow the destination to grow beyond one
+ * block if it is not there yet. Otherwise the destination will get
+ * stuck with that block size forever, which can be as small as 512
+ * bytes, no matter how big the destination grows later.
+ */
+ if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
+ outzp->z_size <= inblksz && outoff + len > inblksz) {
+ error = SET_ERROR(EINVAL);
+ goto unlock;
+ }
+
+ error = zn_rlimit_fsize(outoff + len);
+ if (error != 0) {
+ goto unlock;
+ }
+
+ if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
+ error = SET_ERROR(EFBIG);
+ goto unlock;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
+ &outzp->z_size, 8);
+
+ zilog = outzfsvfs->z_log;
+ maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
+ sizeof (bps[0]);
+
+ uid = KUID_TO_SUID(ZTOUID(outzp));
+ gid = KGID_TO_SGID(ZTOGID(outzp));
+ projid = outzp->z_projid;
+
+ bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
+
+ /*
+ * Clone the file in reasonable size chunks. Each chunk is cloned
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (len > 0) {
+ size = MIN(inblksz * maxblocks, len);
+
+ if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
+ uid) ||
+ zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
+ gid) ||
+ (projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
+ projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ nbps = maxblocks;
+ last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
+ error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
+ &nbps);
+ if (error != 0) {
+ /*
+ * If we are trying to clone a block that was created
+ * in the current transaction group, the error will be
+ * EAGAIN here. Based on zfs_bclone_wait_dirty, either
+ * return a shortened range to the caller so it can
+ * fall back, or wait for the next TXG and check again.
+ */
+ if (error == EAGAIN && zfs_bclone_wait_dirty) {
+ txg_wait_synced(dmu_objset_pool(inos),
+ last_synced_txg + 1);
+ continue;
+ }
+
+ break;
+ }
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(outos);
+ dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
+ db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, outzp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ break;
+ }
+
+ /*
+ * Copy source znode's block size. This is done only if the
+ * whole znode is locked (see zfs_rangelock_cb()) and only
+ * on the first iteration since zfs_rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
+ */
+ if (outlr->lr_length == UINT64_MAX) {
+ zfs_grow_blocksize(outzp, inblksz, tx);
+
+ /*
+ * Block growth may fail for many reasons we cannot
+ * predict here. If it happens, the cloning is doomed.
+ */
+ if (inblksz != outzp->z_blksz) {
+ error = SET_ERROR(EINVAL);
+ dmu_tx_abort(tx);
+ break;
+ }
+
+ /*
+ * Round range lock up to the block boundary, so we
+ * prevent appends until we are done.
+ */
+ zfs_rangelock_reduce(outlr, outoff,
+ ((len - 1) / inblksz + 1) * inblksz);
+ }
+
+ error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
+ bps, nbps);
+ if (error != 0) {
+ dmu_tx_commit(tx);
+ break;
+ }
+
+ if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) {
+ update_pages(outzp, outoff, size, outos);
+ }
+
+ zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
+ &clear_setid_bits_txg, tx);
+
+ zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((outsize = outzp->z_size) < outoff + size) {
+ (void) atomic_cas_64(&outzp->z_size, outsize,
+ outoff + size);
+ }
+
+ error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
+ size, inblksz, bps, nbps);
+
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+
+ inoff += size;
+ outoff += size;
+ len -= size;
+ done += size;
+
+ if (issig()) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+ }
+
+ vmem_free(bps, sizeof (bps[0]) * maxblocks);
+ zfs_znode_update_vfs(outzp);
+
+unlock:
+ zfs_rangelock_exit(outlr);
+ zfs_rangelock_exit(inlr);
+
+ if (done > 0) {
+ /*
+ * If we have made at least partial progress, reset the error.
+ */
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
+
+ if (outos->os_sync == ZFS_SYNC_ALWAYS) {
+ zil_commit(zilog, outzp->z_id);
+ }
+
+ *inoffp += done;
+ *outoffp += done;
+ *lenp = done;
+ } else {
+ /*
+ * If we made no progress, there must be a good reason.
+ * EOF is handled explicitly above, before the loop.
+ */
+ ASSERT3S(error, !=, 0);
+ }
+
+ zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+
+ return (error);
+}
+
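
A small sketch of the chunking arithmetic zfs_clone_range() relies on, assuming the 130816-byte figure quoted in its comment and ZFS's 128-byte block pointers; the helper below is illustrative, while the real code derives maxblocks from zil_max_log_data().

/*
 * Sketch: 130816 bytes of clone-record payload / 128 bytes per blkptr_t
 * gives 1022 block pointers per chunk, so each transaction can clone up
 * to 1022 * inblksz bytes.
 */
static uint64_t
clone_chunk_bytes_example(uint64_t inblksz)
{
	const uint64_t payload = 130816;			/* per ZIL record */
	const uint64_t maxblocks = payload / sizeof (blkptr_t);	/* 1022 */
	return (maxblocks * inblksz);
}
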
+/*
+ * The usual pattern would be to call zfs_clone_range() from
+ * zfs_replay_clone(), but we cannot do that, because when replaying we
+ * don't have the source znode available. This is why we need a dedicated
+ * replay function.
+ */
+int
+zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
+ const blkptr_t *bps, size_t nbps)
+{
+ zfsvfs_t *zfsvfs;
+ dmu_buf_impl_t *db;
+ dmu_tx_t *tx;
+ int error;
+ int count = 0;
+ sa_bulk_attr_t bulk[3];
+ uint64_t mtime[2], ctime[2];
+
+ ASSERT3U(off, <, MAXOFFSET_T);
+ ASSERT3U(len, >, 0);
+ ASSERT3U(nbps, >, 0);
+
+ zfsvfs = ZTOZSB(zp);
+
+ ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+ SPA_FEATURE_BLOCK_CLONING));
+
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
+
+ ASSERT(zfsvfs->z_replay);
+ ASSERT(!zfs_is_readonly(zfsvfs));
+
+ if ((off % blksz) != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+
+ /*
+ * Start a transaction.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
+
+ if (zp->z_blksz < blksz)
+ zfs_grow_blocksize(zp, blksz, tx);
+
+ dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ if (zp->z_size < off + len)
+ zp->z_size = off + len;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ /*
+ * zil_replaying() not only checks if we are replaying the ZIL, but
+ * also updates the ZIL header to record replay progress.
+ */
+ VERIFY(zil_replaying(zfsvfs->z_log, tx));
+
+ dmu_tx_commit(tx);
+
+ zfs_znode_update_vfs(zp);
+
+ zfs_exit(zfsvfs, FTAG);
+
+ return (error);
+}
+
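
For reference, a hedged sketch of how a caller might drive the *inoffp/*outoffp/*lenp contract of zfs_clone_range(): the offsets are advanced and *lenp is set to the number of bytes actually cloned, so cloning a larger region is a simple loop. This is illustrative only, not the platform's actual copy_file_range glue.

static int
clone_entire_range_example(znode_t *inzp, znode_t *outzp, uint64_t len,
    cred_t *cr)
{
	uint64_t inoff = 0, outoff = 0;
	int error = 0;

	while (len > 0) {
		uint64_t chunk = len;

		error = zfs_clone_range(inzp, &inoff, outzp, &outoff,
		    &chunk, cr);
		if (error != 0 || chunk == 0)
			break;			/* error, or source EOF */
		len -= chunk;			/* offsets already advanced */
	}
	return (error);
}
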
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
@@ -928,6 +1548,14 @@ EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
+EXPORT_SYMBOL(zfs_clone_range);
+EXPORT_SYMBOL(zfs_clone_range_replay);
-ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
"Bytes to read per chunk");
+
+ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
+ "Enable block cloning");
+
+ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
+ "Wait for dirty blocks when cloning");
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 640e805d093a..34be54b337fd 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -43,6 +43,8 @@
#include <sys/metaslab.h>
#include <sys/trace_zfs.h>
#include <sys/abd.h>
+#include <sys/brt.h>
+#include <sys/wmsum.h>
/*
* The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
@@ -89,12 +91,12 @@
* committed to stable storage. Please refer to the zil_commit_waiter()
* function (and the comments within it) for more details.
*/
-int zfs_commit_timeout_pct = 5;
+static uint_t zfs_commit_timeout_pct = 10;
/*
* See zil.h for more information about these fields.
*/
-zil_stats_t zil_stats = {
+static zil_kstat_values_t zil_stats = {
{ "zil_commit_count", KSTAT_DATA_UINT64 },
{ "zil_commit_writer_count", KSTAT_DATA_UINT64 },
{ "zil_itx_count", KSTAT_DATA_UINT64 },
@@ -106,11 +108,16 @@ zil_stats_t zil_stats = {
{ "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 },
};
-static kstat_t *zil_ksp;
+static zil_sums_t zil_sums_global;
+static kstat_t *zil_kstats_global;
/*
* Disable intent logging replay. This global ZIL switch affects all pools.
@@ -118,25 +125,25 @@ static kstat_t *zil_ksp;
int zil_replay_disable = 0;
/*
- * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
- * the disk(s) by the ZIL after an LWB write has completed. Setting this
- * will cause ZIL corruption on power loss if a volatile out-of-order
- * write cache is enabled.
+ * Disable the flush commands that are normally sent to the disk(s) by the ZIL
+ * after an LWB write has completed. Setting this will cause ZIL corruption on
+ * power loss if a volatile out-of-order write cache is enabled.
*/
-int zil_nocacheflush = 0;
+static int zil_nocacheflush = 0;
/*
* Limit SLOG write size per commit executed with synchronous priority.
* Any writes above that will be executed with lower (asynchronous) priority
* to limit potential SLOG device abuse by single active ZIL writer.
*/
-unsigned long zil_slog_bulk = 768 * 1024;
+static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;
-#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
- sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
+static itx_t *zil_itx_clone(itx_t *oitx);
+static uint64_t zil_max_waste_space(zilog_t *zilog);
static int
zil_bp_compare(const void *x1, const void *x2)
@@ -213,16 +220,30 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
}
+static int
+zil_kstats_global_update(kstat_t *ksp, int rw)
+{
+ zil_kstat_values_t *zs = ksp->ks_data;
+ ASSERT3P(&zil_stats, ==, zs);
+
+ if (rw == KSTAT_WRITE) {
+ return (SET_ERROR(EACCES));
+ }
+
+ zil_kstat_values_update(zs, &zil_sums_global);
+
+ return (0);
+}
+
/*
* Read a log block and make sure it's valid.
*/
static int
zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
- blkptr_t *nbp, void *dst, char **end)
+ blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
{
- enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t aflags = ARC_FLAG_WAIT;
- arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;
@@ -239,7 +260,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
- &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+ abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
zio_cksum_t cksum = bp->blk_cksum;
@@ -254,39 +275,35 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
+ uint64_t size = BP_GET_LSIZE(bp);
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
- zil_chain_t *zilc = abuf->b_data;
+ zil_chain_t *zilc = (*abuf)->b_data;
char *lr = (char *)(zilc + 1);
- uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
- if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) ||
+ zilc->zc_nused < sizeof (*zilc) ||
+ zilc->zc_nused > size) {
error = SET_ERROR(ECKSUM);
} else {
- ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
- bcopy(lr, dst, len);
- *end = (char *)dst + len;
+ *begin = lr;
+ *end = lr + zilc->zc_nused - sizeof (*zilc);
*nbp = zilc->zc_next_blk;
}
} else {
- char *lr = abuf->b_data;
- uint64_t size = BP_GET_LSIZE(bp);
+ char *lr = (*abuf)->b_data;
zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
- if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) ||
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
- ASSERT3U(zilc->zc_nused, <=,
- SPA_OLD_MAXBLOCKSIZE);
- bcopy(lr, dst, zilc->zc_nused);
- *end = (char *)dst + zilc->zc_nused;
+ *begin = lr;
+ *end = lr + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
}
}
-
- arc_buf_destroy(abuf, &abuf);
}
return (error);
@@ -298,7 +315,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
static int
zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
{
- enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
const blkptr_t *bp = &lr->lr_blkptr;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
@@ -307,7 +324,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (BP_IS_HOLE(bp)) {
if (wbuf != NULL)
- bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+ memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length));
return (0);
}
@@ -322,6 +339,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (wbuf == NULL)
zio_flags |= ZIO_FLAG_RAW;
+ ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
@@ -330,13 +348,96 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
if (error == 0) {
if (wbuf != NULL)
- bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+ memcpy(wbuf, abuf->b_data, arc_buf_size(abuf));
arc_buf_destroy(abuf, &abuf);
}
return (error);
}
+void
+zil_sums_init(zil_sums_t *zs)
+{
+ wmsum_init(&zs->zil_commit_count, 0);
+ wmsum_init(&zs->zil_commit_writer_count, 0);
+ wmsum_init(&zs->zil_itx_count, 0);
+ wmsum_init(&zs->zil_itx_indirect_count, 0);
+ wmsum_init(&zs->zil_itx_indirect_bytes, 0);
+ wmsum_init(&zs->zil_itx_copied_count, 0);
+ wmsum_init(&zs->zil_itx_copied_bytes, 0);
+ wmsum_init(&zs->zil_itx_needcopy_count, 0);
+ wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
+ wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
+ wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
+ wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
+ wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
+ wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
+ wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
+ wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
+ wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
+}
+
+void
+zil_sums_fini(zil_sums_t *zs)
+{
+ wmsum_fini(&zs->zil_commit_count);
+ wmsum_fini(&zs->zil_commit_writer_count);
+ wmsum_fini(&zs->zil_itx_count);
+ wmsum_fini(&zs->zil_itx_indirect_count);
+ wmsum_fini(&zs->zil_itx_indirect_bytes);
+ wmsum_fini(&zs->zil_itx_copied_count);
+ wmsum_fini(&zs->zil_itx_copied_bytes);
+ wmsum_fini(&zs->zil_itx_needcopy_count);
+ wmsum_fini(&zs->zil_itx_needcopy_bytes);
+ wmsum_fini(&zs->zil_itx_metaslab_normal_count);
+ wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
+ wmsum_fini(&zs->zil_itx_metaslab_normal_write);
+ wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
+ wmsum_fini(&zs->zil_itx_metaslab_slog_count);
+ wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
+ wmsum_fini(&zs->zil_itx_metaslab_slog_write);
+ wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
+}
+
+void
+zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
+{
+ zs->zil_commit_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_commit_count);
+ zs->zil_commit_writer_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_commit_writer_count);
+ zs->zil_itx_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_count);
+ zs->zil_itx_indirect_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_indirect_count);
+ zs->zil_itx_indirect_bytes.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_indirect_bytes);
+ zs->zil_itx_copied_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_copied_count);
+ zs->zil_itx_copied_bytes.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_copied_bytes);
+ zs->zil_itx_needcopy_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_needcopy_count);
+ zs->zil_itx_needcopy_bytes.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_needcopy_bytes);
+ zs->zil_itx_metaslab_normal_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
+ zs->zil_itx_metaslab_normal_bytes.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
+ zs->zil_itx_metaslab_normal_write.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
+ zs->zil_itx_metaslab_normal_alloc.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
+ zs->zil_itx_metaslab_slog_count.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
+ zs->zil_itx_metaslab_slog_bytes.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
+ zs->zil_itx_metaslab_slog_write.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
+ zs->zil_itx_metaslab_slog_alloc.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
+}
+
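
A minimal sketch of how these helpers fit together, assuming the standard wmsum_add()/wmsum_value() API; the bump helper below is hypothetical. Hot paths add to a zil_sums_t, and a kstat update callback such as zil_kstats_global_update() above folds the sums into a zil_kstat_values_t via zil_kstat_values_update().

/* Illustrative: account one commit of `bytes` against a zil_sums_t. */
static void
zil_sums_bump_example(zil_sums_t *zs, uint64_t bytes)
{
	wmsum_add(&zs->zil_commit_count, 1);
	wmsum_add(&zs->zil_itx_needcopy_bytes, bytes);
}
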
/*
* Parse the intent log, and call parse_func for each valid record within.
*/
@@ -353,12 +454,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
uint64_t max_lr_seq = 0;
uint64_t blk_count = 0;
uint64_t lr_count = 0;
- blkptr_t blk, next_blk;
- char *lrbuf, *lrp;
+ blkptr_t blk, next_blk = {{{{0}}}};
int error = 0;
- bzero(&next_blk, sizeof (blkptr_t));
-
/*
* Old logs didn't record the maximum zh_claim_lr_seq.
*/
@@ -374,13 +472,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
int reclen;
- char *end = NULL;
+ char *lrp, *end;
+ arc_buf_t *abuf = NULL;
if (blk_seq > claim_blk_seq)
break;
@@ -396,24 +494,42 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
break;
error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
- lrbuf, &end);
- if (error != 0)
+ &lrp, &end, &abuf);
+ if (error != 0) {
+ if (abuf)
+ arc_buf_destroy(abuf, &abuf);
+ if (claimed) {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+
+ dmu_objset_name(zilog->zl_os, name);
+
+ cmn_err(CE_WARN, "ZFS read log block error %d, "
+ "dataset %s, seq 0x%llx\n", error, name,
+ (u_longlong_t)blk_seq);
+ }
break;
+ }
- for (lrp = lrbuf; lrp < end; lrp += reclen) {
+ for (; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
- if (lr->lrc_seq > claim_lr_seq)
+ ASSERT3U(reclen, <=, end - lrp);
+ if (lr->lrc_seq > claim_lr_seq) {
+ arc_buf_destroy(abuf, &abuf);
goto done;
+ }
error = parse_lr_func(zilog, lr, arg, txg);
- if (error != 0)
+ if (error != 0) {
+ arc_buf_destroy(abuf, &abuf);
goto done;
+ }
ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
lr_count++;
}
+ arc_buf_destroy(abuf, &abuf);
}
done:
zilog->zl_parse_error = error;
@@ -422,21 +538,16 @@ done:
zilog->zl_parse_blk_count = blk_count;
zilog->zl_parse_lr_count = lr_count;
- ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
- (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
- (decrypt && error == EIO));
-
zil_bp_tree_fini(zilog);
- zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
-/* ARGSUSED */
static int
zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
uint64_t first_txg)
{
+ (void) tx;
ASSERT(!BP_IS_HOLE(bp));
/*
@@ -445,7 +556,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* that we rewind to is invalid. Thus, we return -1 so
* zil_parse() doesn't attempt to read it.
*/
- if (bp->blk_birth >= first_txg)
+ if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
return (-1);
if (zil_bp_tree_add(zilog, bp) != 0)
@@ -455,11 +566,11 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
return (0);
}
-/* ARGSUSED */
static int
zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
uint64_t first_txg)
{
+ (void) zilog, (void) lrc, (void) tx, (void) first_txg;
return (0);
}
@@ -471,7 +582,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
- if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+ if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg ||
zil_bp_tree_add(zilog, bp) != 0)
return (0);
@@ -481,14 +592,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
}
static int
-zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
- uint64_t first_txg)
+zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
int error;
- if (lrc->lrc_txtype != TX_WRITE)
- return (0);
+ ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
/*
* If the block is not readable, don't claim it. This can happen
@@ -498,7 +607,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
- if (lr->lr_blkptr.blk_birth >= first_txg) {
+ if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) {
error = zil_read_log_data(zilog, lr, NULL);
if (error != 0)
return (error);
@@ -507,35 +616,156 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
-/* ARGSUSED */
+static int
+zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t first_txg)
+{
+ const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+ const blkptr_t *bp;
+ spa_t *spa = zilog->zl_spa;
+ uint_t ii;
+
+ ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
+ ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
+ lr_bps[lr->lr_nbps]));
+
+ if (tx == NULL) {
+ return (0);
+ }
+
+ /*
+ * XXX: Do we need to byteswap lr?
+ */
+
+ for (ii = 0; ii < lr->lr_nbps; ii++) {
+ bp = &lr->lr_bps[ii];
+
+ /*
+ * When data is embedded into the BP there is no need to create
+ * a BRT entry, as there is no data block. Just copy the BP as
+ * it contains the data.
+ */
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ continue;
+
+ /*
+ * We cannot handle block pointers from the future, since they
+ * are not yet allocated. It should not normally happen, but
+ * just in case let's be safe and stop here now instead of
+ * corrupting the pool.
+ */
+ if (BP_GET_BIRTH(bp) >= first_txg)
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * Assert the block is really allocated before we reference it.
+ */
+ metaslab_check_free(spa, bp);
+ }
+
+ for (ii = 0; ii < lr->lr_nbps; ii++) {
+ bp = &lr->lr_bps[ii];
+ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp))
+ brt_pending_add(spa, bp, tx);
+ }
+
+ return (0);
+}
+
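
The reclen asserts above capture how a clone-range record is sized: a fixed lr_clone_range_t header followed by lr_nbps block pointers. A hedged sketch of that arithmetic, with an illustrative helper name:

/* Illustrative: bytes needed by a clone-range record with nbps pointers. */
static size_t
lr_clone_range_reclen_example(size_t nbps)
{
	return (offsetof(lr_clone_range_t, lr_bps[nbps]));
}
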
+static int
+zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t first_txg)
+{
+
+ switch (lrc->lrc_txtype) {
+ case TX_WRITE:
+ return (zil_claim_write(zilog, lrc, tx, first_txg));
+ case TX_CLONE_RANGE:
+ return (zil_claim_clone_range(zilog, lrc, tx, first_txg));
+ default:
+ return (0);
+ }
+}
+
static int
zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
uint64_t claim_txg)
{
+ (void) claim_txg;
+
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);
}
static int
-zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
- uint64_t claim_txg)
+zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
{
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
+ ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
+
/*
* If we previously claimed it, we need to free it.
*/
- if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
- bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
- !BP_IS_HOLE(bp))
+ if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg &&
+ zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) {
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+ }
return (0);
}
static int
+zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+{
+ const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+ const blkptr_t *bp;
+ spa_t *spa;
+ uint_t ii;
+
+ ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
+ ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
+ lr_bps[lr->lr_nbps]));
+
+ if (tx == NULL) {
+ return (0);
+ }
+
+ spa = zilog->zl_spa;
+
+ for (ii = 0; ii < lr->lr_nbps; ii++) {
+ bp = &lr->lr_bps[ii];
+
+ if (!BP_IS_HOLE(bp)) {
+ zio_free(spa, dmu_tx_get_txg(tx), bp);
+ }
+ }
+
+ return (0);
+}
+
+static int
+zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t claim_txg)
+{
+
+ if (claim_txg == 0) {
+ return (0);
+ }
+
+ switch (lrc->lrc_txtype) {
+ case TX_WRITE:
+ return (zil_free_write(zilog, lrc, tx, claim_txg));
+ case TX_CLONE_RANGE:
+ return (zil_free_clone_range(zilog, lrc, tx));
+ default:
+ return (0);
+ }
+}
+
+static int
zil_lwb_vdev_compare(const void *x1, const void *x2)
{
const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
@@ -544,41 +774,54 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
return (TREE_CMP(v1, v2));
}
+/*
+ * Allocate a new lwb. We may already have a block pointer for it, in which
+ * case we get size and version from there. Or we may not yet, in which case
+ * we choose them here and later make the block allocation match.
+ */
static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
- boolean_t fastwrite)
+zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
+ uint64_t txg, lwb_state_t state)
{
lwb_t *lwb;
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
lwb->lwb_zilog = zilog;
- lwb->lwb_blk = *bp;
- lwb->lwb_fastwrite = fastwrite;
+ if (bp) {
+ lwb->lwb_blk = *bp;
+ lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
+ sz = BP_GET_LSIZE(bp);
+ } else {
+ BP_ZERO(&lwb->lwb_blk);
+ lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
+ SPA_VERSION_SLIM_ZIL);
+ }
lwb->lwb_slog = slog;
- lwb->lwb_state = LWB_STATE_CLOSED;
- lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
- lwb->lwb_max_txg = txg;
+ lwb->lwb_error = 0;
+ if (lwb->lwb_slim) {
+ lwb->lwb_nmax = sz;
+ lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
+ } else {
+ lwb->lwb_nmax = sz - sizeof (zil_chain_t);
+ lwb->lwb_nused = lwb->lwb_nfilled = 0;
+ }
+ lwb->lwb_sz = sz;
+ lwb->lwb_state = state;
+ lwb->lwb_buf = zio_buf_alloc(sz);
+ lwb->lwb_child_zio = NULL;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
- lwb->lwb_tx = NULL;
lwb->lwb_issued_timestamp = 0;
- if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
- lwb->lwb_nused = sizeof (zil_chain_t);
- lwb->lwb_sz = BP_GET_LSIZE(bp);
- } else {
- lwb->lwb_nused = 0;
- lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
- }
+ lwb->lwb_issued_txg = 0;
+ lwb->lwb_alloc_txg = txg;
+ lwb->lwb_max_txg = 0;
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
+ if (state != LWB_STATE_NEW)
+ zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
- ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
- ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
- VERIFY(list_is_empty(&lwb->lwb_waiters));
- VERIFY(list_is_empty(&lwb->lwb_itxs));
-
return (lwb);
}
@@ -586,15 +829,17 @@ static void
zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
{
ASSERT(MUTEX_HELD(&zilog->zl_lock));
- ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
- VERIFY(list_is_empty(&lwb->lwb_waiters));
- VERIFY(list_is_empty(&lwb->lwb_itxs));
- ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+ ASSERT3P(lwb->lwb_child_zio, ==, NULL);
ASSERT3P(lwb->lwb_write_zio, ==, NULL);
ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+ ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
- ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+ VERIFY(list_is_empty(&lwb->lwb_itxs));
+ VERIFY(list_is_empty(&lwb->lwb_waiters));
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
/*
* Clear the zilog's field to indicate this lwb is no longer
@@ -663,6 +908,36 @@ zilog_is_dirty(zilog_t *zilog)
}
/*
+ * This is called in zil_commit context (zil_process_commit_list()/
+ * zil_create()). It activates the SPA_FEATURE_ZILSAXATTR feature, if it is
+ * enabled. We check dsl_dataset_feature_is_active() to avoid a
+ * txg_wait_synced() on every zil_commit().
+ */
+static void
+zil_commit_activate_saxattr_feature(zilog_t *zilog)
+{
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+ uint64_t txg = 0;
+ dmu_tx_t *tx = NULL;
+
+ if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
+ dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL &&
+ !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) {
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_dataset_dirty(ds, tx);
+ txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
+ (void *)B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+}
+
+/*
* Create an on-disk intent log.
*/
static lwb_t *
@@ -674,8 +949,9 @@ zil_create(zilog_t *zilog)
dmu_tx_t *tx = NULL;
blkptr_t blk;
int error = 0;
- boolean_t fastwrite = FALSE;
boolean_t slog = FALSE;
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+
/*
* Wait for any previous destroy to complete.
@@ -705,8 +981,6 @@ zil_create(zilog_t *zilog)
error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
ZIL_MIN_BLKSZ, &slog);
- fastwrite = TRUE;
-
if (error == 0)
zil_init_log_chain(zilog, &blk);
}
@@ -715,7 +989,7 @@ zil_create(zilog_t *zilog)
* Allocate a log write block (lwb) for the first log block.
*/
if (error == 0)
- lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite);
+ lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
/*
* If we just allocated the first log block, commit our transaction
@@ -723,11 +997,35 @@ zil_create(zilog_t *zilog)
* (zh is part of the MOS, so we cannot modify it in open context.)
*/
if (tx != NULL) {
+ /*
+ * If "zilsaxattr" feature is enabled on zpool, then activate
+ * it now when we're creating the ZIL chain. We can't wait with
+ * this until we write the first xattr log record because we
+ * need to wait for the feature activation to sync out.
+ */
+ if (spa_feature_is_enabled(zilog->zl_spa,
+ SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) !=
+ DMU_OST_ZVOL) {
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
+ (void *)B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ }
+
dmu_tx_commit(tx);
txg_wait_synced(zilog->zl_dmu_pool, txg);
+ } else {
+ /*
+ * This branch covers the case where we enable the feature on a
+ * zpool that has existing ZIL headers.
+ */
+ zil_commit_activate_saxattr_feature(zilog);
}
+ IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
+ dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL,
+ dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR));
- ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+ ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
IMPLY(error == 0, lwb != NULL);
return (lwb);
@@ -741,8 +1039,9 @@ zil_create(zilog_t *zilog)
* txg_wait_synced() here either when keep_first is set, because both
* zil_create() and zil_destroy() will wait for any in-progress destroys
* to complete.
+ * Return B_TRUE if there were any entries to replay.
*/
-void
+boolean_t
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
const zil_header_t *zh = zilog->zl_header;
@@ -758,7 +1057,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
zilog->zl_old_header = *zh; /* debugging aid */
if (BP_IS_HOLE(&zh->zh_log))
- return;
+ return (B_FALSE);
tx = dmu_tx_create(zilog->zl_os);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
@@ -774,15 +1073,11 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
if (!list_is_empty(&zilog->zl_lwb_list)) {
ASSERT(zh->zh_claim_txg == 0);
VERIFY(!keep_first);
- while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
- if (lwb->lwb_fastwrite)
- metaslab_fastwrite_unmark(zilog->zl_spa,
- &lwb->lwb_blk);
-
- list_remove(&zilog->zl_lwb_list, lwb);
+ while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
+ if (!BP_IS_HOLE(&lwb->lwb_blk))
+ zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
zil_free_lwb(zilog, lwb);
}
} else if (!keep_first) {
@@ -791,6 +1086,8 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
mutex_exit(&zilog->zl_lock);
dmu_tx_commit(tx);
+
+ return (B_TRUE);
}
void
@@ -911,10 +1208,10 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
-/* ARGSUSED */
int
zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
+ (void) dp;
zilog_t *zilog;
objset_t *os;
blkptr_t *bp;
@@ -1008,21 +1305,21 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
{
/*
* The lwb_waiters field of the lwb is protected by the zilog's
- * zl_lock, thus it must be held when calling this function.
+ * zl_issuer_lock while the lwb is open and zl_lock otherwise.
+ * zl_issuer_lock also protects leaving the open state.
+ * Setting zcw_lwb is protected by zl_issuer_lock and by the lwb state
+ * not yet being FLUSH_DONE; that state transition is protected by
+ * zl_lock.
*/
- ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock));
+ IMPLY(lwb->lwb_state != LWB_STATE_OPENED,
+ MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
- mutex_enter(&zcw->zcw_lock);
ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
- ASSERT3P(lwb, !=, NULL);
- ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
- lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE);
-
list_insert_tail(&lwb->lwb_waiters, zcw);
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
zcw->zcw_lwb = lwb;
- mutex_exit(&zcw->zcw_lock);
}
/*
@@ -1033,11 +1330,9 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
static void
zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
{
- mutex_enter(&zcw->zcw_lock);
ASSERT(!list_link_active(&zcw->zcw_node));
- ASSERT3P(zcw->zcw_lwb, ==, NULL);
list_insert_tail(nolwb, zcw);
- mutex_exit(&zcw->zcw_lock);
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
}
void
@@ -1049,6 +1344,9 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
int ndvas = BP_GET_NDVAS(bp);
int i;
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
if (zil_nocacheflush)
return;
@@ -1107,48 +1405,33 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
}
/*
- * This function is a called after all vdevs associated with a given lwb
- * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set. Further,
- * all "previous" lwb's will have completed before this function is
- * called; i.e. this function is called for all previous lwbs before
- * it's called for "this" lwb (enforced via zio the dependencies
- * configured in zil_lwb_set_zio_dependency()).
+ * This function is a called after all vdevs associated with a given lwb write
+ * have completed their flush command; or as soon as the lwb write completes,
+ * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have
+ * completed before this function is called; i.e. this function is called for
+ * all previous lwbs before it's called for "this" lwb (enforced via zio the
+ * dependencies configured in zil_lwb_set_zio_dependency()).
*
- * The intention is for this function to be called as soon as the
- * contents of an lwb are considered "stable" on disk, and will survive
- * any sudden loss of power. At this point, any threads waiting for the
- * lwb to reach this state are signalled, and the "waiter" structures
- * are marked "done".
+ * The intention is for this function to be called as soon as the contents of
+ * an lwb are considered "stable" on disk, and will survive any sudden loss of
+ * power. At this point, any threads waiting for the lwb to reach this state
+ * are signalled, and the "waiter" structures are marked "done".
*/
static void
zil_lwb_flush_vdevs_done(zio_t *zio)
{
lwb_t *lwb = zio->io_private;
zilog_t *zilog = lwb->lwb_zilog;
- dmu_tx_t *tx = lwb->lwb_tx;
zil_commit_waiter_t *zcw;
itx_t *itx;
spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
- zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
mutex_enter(&zilog->zl_lock);
- /*
- * Ensure the lwb buffer pointer is cleared before releasing the
- * txg. If we have had an allocation failure and the txg is
- * waiting to sync then we want zil_sync() to remove the lwb so
- * that it's not picked up as the next new one in
- * zil_process_commit_list(). zil_sync() will only remove the
- * lwb if lwb_buf is null.
- */
- lwb->lwb_buf = NULL;
- lwb->lwb_tx = NULL;
-
- ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
- zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
+ zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
lwb->lwb_root_zio = NULL;
@@ -1165,17 +1448,12 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}
- while ((itx = list_head(&lwb->lwb_itxs)) != NULL) {
- list_remove(&lwb->lwb_itxs, itx);
+ while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
zil_itx_destroy(itx);
- }
- while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
+ while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
- ASSERT(list_link_active(&zcw->zcw_node));
- list_remove(&lwb->lwb_waiters, zcw);
-
ASSERT3P(zcw->zcw_lwb, ==, lwb);
zcw->zcw_lwb = NULL;
/*
@@ -1202,28 +1480,65 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
mutex_exit(&zcw->zcw_lock);
}
+ uint64_t txg = lwb->lwb_issued_txg;
+
+ /* Once we drop the lock, lwb may be freed by zil_sync(). */
mutex_exit(&zilog->zl_lock);
- /*
- * Now that we've written this log block, we have a stable pointer
- * to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync.
- */
- dmu_tx_commit(tx);
+ mutex_enter(&zilog->zl_lwb_io_lock);
+ ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
+ zilog->zl_lwb_inflight[txg & TXG_MASK]--;
+ if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
+ cv_broadcast(&zilog->zl_lwb_io_cv);
+ mutex_exit(&zilog->zl_lwb_io_lock);
}
/*
- * This is called when an lwb's write zio completes. The callback's
- * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
- * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
- * in writing out this specific lwb's data, and in the case that cache
- * flushes have been deferred, vdevs involved in writing the data for
- * previous lwbs. The writes corresponding to all the vdevs in the
- * lwb_vdev_tree will have completed by the time this is called, due to
- * the zio dependencies configured in zil_lwb_set_zio_dependency(),
- * which takes deferred flushes into account. The lwb will be "done"
- * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
- * completion callback for the lwb's root zio.
+ * Wait for the completion of all issued write/flush of that txg provided.
+ * It guarantees zil_lwb_flush_vdevs_done() is called and returned.
+ */
+static void
+zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
+{
+ ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa));
+
+ mutex_enter(&zilog->zl_lwb_io_lock);
+ while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0)
+ cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock);
+ mutex_exit(&zilog->zl_lwb_io_lock);
+
+#ifdef ZFS_DEBUG
+ mutex_enter(&zilog->zl_lock);
+ mutex_enter(&zilog->zl_lwb_io_lock);
+ lwb_t *lwb = list_head(&zilog->zl_lwb_list);
+ while (lwb != NULL) {
+ if (lwb->lwb_issued_txg <= txg) {
+ ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
+ ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
+ IMPLY(lwb->lwb_issued_txg > 0,
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+ }
+ IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE,
+ lwb->lwb_buf == NULL);
+ lwb = list_next(&zilog->zl_lwb_list, lwb);
+ }
+ mutex_exit(&zilog->zl_lwb_io_lock);
+ mutex_exit(&zilog->zl_lock);
+#endif
+}
+
+/*
+ * This is called when an lwb's write zio completes. The callback's purpose is
+ * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The
+ * tree will contain the vdevs involved in writing out this specific lwb's
+ * data, and in the case that cache flushes have been deferred, vdevs involved
+ * in writing the data for previous lwbs. The writes corresponding to all the
+ * vdevs in the lwb_vdev_tree will have completed by the time this is called,
+ * due to the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done" once
+ * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion
+ * callback for the lwb's root zio.
*/
static void
zil_lwb_write_done(zio_t *zio)
@@ -1238,22 +1553,25 @@ zil_lwb_write_done(zio_t *zio)
ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
- ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
- ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
- ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
- ASSERT(!BP_IS_GANG(zio->io_bp));
- ASSERT(!BP_IS_HOLE(zio->io_bp));
- ASSERT(BP_GET_FILL(zio->io_bp) == 0);
-
abd_free(zio->io_abd);
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ lwb->lwb_buf = NULL;
mutex_enter(&zilog->zl_lock);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
lwb->lwb_state = LWB_STATE_WRITE_DONE;
+ lwb->lwb_child_zio = NULL;
lwb->lwb_write_zio = NULL;
- lwb->lwb_fastwrite = FALSE;
+
+ /*
+ * If nlwb has not yet been issued, zil_lwb_set_zio_dependency() has
+ * not been called for it yet, and when it is, it won't be able to make
+ * its write ZIO a parent of this ZIO. In that case we cannot defer
+ * our flushes, or there may be a race between the done callbacks.
+ */
nlwb = list_next(&zilog->zl_lwb_list, lwb);
+ if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
+ nlwb = NULL;
mutex_exit(&zilog->zl_lock);
if (avl_numnodes(t) == 0)
@@ -1279,21 +1597,20 @@ zil_lwb_write_done(zio_t *zio)
}
/*
- * If this lwb does not have any threads waiting for it to
- * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
- * command to the vdevs written to by "this" lwb, and instead
- * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
- * command for those vdevs. Thus, we merge the vdev tree of
- * "this" lwb with the vdev tree of the "next" lwb in the list,
- * and assume the "next" lwb will handle flushing the vdevs (or
- * deferring the flush(s) again).
+ * If this lwb does not have any threads waiting for it to complete, we
+ * want to defer issuing the flush command to the vdevs written to by
+ * "this" lwb, and instead rely on the "next" lwb to handle the flush
+ * command for those vdevs. Thus, we merge the vdev tree of "this" lwb
+ * with the vdev tree of the "next" lwb in the list, and assume the
+ * "next" lwb will handle flushing the vdevs (or deferring the flush(s)
+ * again).
*
- * This is a useful performance optimization, especially for
- * workloads with lots of async write activity and few sync
- * write and/or fsync activity, as it has the potential to
- * coalesce multiple flush commands to a vdev into one.
+ * This is a useful performance optimization, especially for workloads
+ * with lots of async write activity and few sync write and/or fsync
+ * activity, as it has the potential to coalesce multiple flush
+ * commands to a vdev into one.
*/
- if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
zil_lwb_flush_defer(lwb, nlwb);
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
return;
@@ -1316,338 +1633,487 @@ zil_lwb_write_done(zio_t *zio)
}
}
+/*
+ * Build the zio dependency chain, which is used to preserve the ordering of
+ * lwb completions that is required by the semantics of the ZIL. Each new lwb
+ * zio becomes a parent of the previous lwb zio, such that the new lwb's zio
+ * cannot complete until the previous lwb's zio completes.
+ *
+ * This is required by the semantics of zil_commit(): the commit waiters
+ * attached to the lwbs will be woken in the lwb zio's completion callback,
+ * so this zio dependency graph ensures the waiters are woken in the correct
+ * order (the same order the lwbs were created).
+ */
static void
zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
{
- lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
-
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT(MUTEX_HELD(&zilog->zl_lock));
+ lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb);
+ if (prev_lwb == NULL ||
+ prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ return;
+
/*
- * The zilog's "zl_last_lwb_opened" field is used to build the
- * lwb/zio dependency chain, which is used to preserve the
- * ordering of lwb completions that is required by the semantics
- * of the ZIL. Each new lwb zio becomes a parent of the
- * "previous" lwb zio, such that the new lwb's zio cannot
- * complete until the "previous" lwb's zio completes.
+ * If the previous lwb's write hasn't already completed, we also want
+ * to order the completion of the lwb write zios (above, we only order
+ * the completion of the lwb root zios). This is required because of
+ * how we can defer the flush commands for each lwb.
*
- * This is required by the semantics of zil_commit(); the commit
- * waiters attached to the lwbs will be woken in the lwb zio's
- * completion callback, so this zio dependency graph ensures the
- * waiters are woken in the correct order (the same order the
- * lwbs were created).
- */
- if (last_lwb_opened != NULL &&
- last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
- ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
- last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
- last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
-
- ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
- zio_add_child(lwb->lwb_root_zio,
- last_lwb_opened->lwb_root_zio);
-
- /*
- * If the previous lwb's write hasn't already completed,
- * we also want to order the completion of the lwb write
- * zios (above, we only order the completion of the lwb
- * root zios). This is required because of how we can
- * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
- *
- * When the DKIOCFLUSHWRITECACHE commands are deferred,
- * the previous lwb will rely on this lwb to flush the
- * vdevs written to by that previous lwb. Thus, we need
- * to ensure this lwb doesn't issue the flush until
- * after the previous lwb's write completes. We ensure
- * this ordering by setting the zio parent/child
- * relationship here.
- *
- * Without this relationship on the lwb's write zio,
- * it's possible for this lwb's write to complete prior
- * to the previous lwb's write completing; and thus, the
- * vdevs for the previous lwb would be flushed prior to
- * that lwb's data being written to those vdevs (the
- * vdevs are flushed in the lwb write zio's completion
- * handler, zil_lwb_write_done()).
- */
- if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
- ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
- last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
-
- ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
- zio_add_child(lwb->lwb_write_zio,
- last_lwb_opened->lwb_write_zio);
- }
+ * When the flush commands are deferred, the previous lwb will rely on
+ * this lwb to flush the vdevs written to by that previous lwb. Thus,
+ * we need to ensure this lwb doesn't issue the flush until after the
+ * previous lwb's write completes. We ensure this ordering by setting
+ * the zio parent/child relationship here.
+ *
+ * Without this relationship on the lwb's write zio, it's possible for
+ * this lwb's write to complete prior to the previous lwb's write
+ * completing; and thus, the vdevs for the previous lwb would be
+ * flushed prior to that lwb's data being written to those vdevs (the
+ * vdevs are flushed in the lwb write zio's completion handler,
+ * zil_lwb_write_done()).
+ */
+ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
+ ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
+ zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
+ } else {
+ ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
}
+
+ ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL);
+ zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio);
}
/*
* This function's purpose is to "open" an lwb such that it is ready to
- * accept new itxs being committed to it. To do this, the lwb's zio
- * structures are created, and linked to the lwb. This function is
- * idempotent; if the passed in lwb has already been opened, this
- * function is essentially a no-op.
+ * accept new itxs being committed to it. This function is idempotent; if
+ * the passed in lwb has already been opened, it is essentially a no-op.
*/
static void
zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
{
- zbookmark_phys_t zb;
- zio_priority_t prio;
-
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT3P(lwb, !=, NULL);
- EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
- EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
- SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
- ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
- lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
+ if (lwb->lwb_state != LWB_STATE_NEW) {
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ return;
+ }
- /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
mutex_enter(&zilog->zl_lock);
- if (lwb->lwb_root_zio == NULL) {
- abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
- BP_GET_LSIZE(&lwb->lwb_blk));
+ lwb->lwb_state = LWB_STATE_OPENED;
+ zilog->zl_last_lwb_opened = lwb;
+ mutex_exit(&zilog->zl_lock);
+}
- if (!lwb->lwb_fastwrite) {
- metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
- lwb->lwb_fastwrite = 1;
- }
+/*
+ * Maximum block size used by the ZIL. This is picked up when the ZIL is
+ * initialized. Otherwise this should not be used directly; see
+ * zl_max_block_size instead.
+ */
+static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
- if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
- prio = ZIO_PRIORITY_SYNC_WRITE;
- else
- prio = ZIO_PRIORITY_ASYNC_WRITE;
+/*
+ * Plan splitting of the provided burst size between several blocks.
+ */
+static uint_t
+zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
+{
+ uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
- lwb->lwb_root_zio = zio_root(zilog->zl_spa,
- zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
- ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ if (size <= md) {
+ /*
+ * Small bursts are written as-is in one block.
+ */
+ *minsize = size;
+ return (size);
+ } else if (size > 8 * md) {
+ /*
+ * Big bursts use maximum blocks. The first block size
+ * is hard to predict, but it does not really matter.
+ */
+ *minsize = 0;
+ return (md);
+ }
- lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
- zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
- BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
- prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
- ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ /*
+ * Medium bursts try to divide evenly to better utilize several SLOG
+	 * VDEVs. We predict the first block size assuming the worst case of
+	 * others maxing out. Fall back to using maximum blocks if, due to
+	 * large records or wasted space, we cannot predict anything better.
+ */
+ uint_t s = size;
+ uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
+ uint_t chunk = DIV_ROUND_UP(s, n);
+ uint_t waste = zil_max_waste_space(zilog);
+ waste = MAX(waste, zilog->zl_cur_max);
+ if (chunk <= md - waste) {
+ *minsize = MAX(s - (md - waste) * (n - 1), waste);
+ return (chunk);
+ } else {
+ *minsize = 0;
+ return (md);
+ }
+}
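To make the medium-burst arithmetic above concrete, here is a minimal stand-alone sketch; the 128K payload, 192-byte record header, and 300K burst are made-up example values, and DIV_ROUND_UP is redefined locally rather than taken from the kernel headers.

/*
 * Hypothetical, self-contained illustration of the medium-burst split.
 * Example numbers only: md = 128K payload, 192-byte write header,
 * size = 300K burst.
 */
#include <stdio.h>

#define	DIV_ROUND_UP(a, b)	(((a) + (b) - 1) / (b))

int
main(void)
{
	unsigned md = 131072;		/* assumed max payload per block */
	unsigned hdr = 192;		/* assumed lr_write_t overhead */
	unsigned size = 300 * 1024;	/* burst size to split */

	unsigned n = DIV_ROUND_UP(size, md - hdr);	/* blocks needed */
	unsigned chunk = DIV_ROUND_UP(size, n);		/* even split */
	printf("n=%u chunk=%u\n", n, chunk);	/* prints n=3 chunk=102400 */
	return (0);
}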
- lwb->lwb_state = LWB_STATE_OPENED;
+/*
+ * Try to predict the next block size based on previous history. Make the
+ * prediction sufficient for 7 of 8 previous bursts. Don't try to save if
+ * the saving is less than 50%; extra writes may cost more, but we don't
+ * want a single spike to badly affect our predictions.
+ */
+static uint_t
+zil_lwb_predict(zilog_t *zilog)
+{
+ uint_t m, o;
- zil_lwb_set_zio_dependency(zilog, lwb);
- zilog->zl_last_lwb_opened = lwb;
+	/* If we are in the middle of a burst, also take it into account. */
+ if (zilog->zl_cur_size > 0) {
+ o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
+ } else {
+ o = UINT_MAX;
+ m = 0;
}
- mutex_exit(&zilog->zl_lock);
- ASSERT3P(lwb->lwb_root_zio, !=, NULL);
- ASSERT3P(lwb->lwb_write_zio, !=, NULL);
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ /* Find minimum optimal size. We don't need to go below that. */
+ for (int i = 0; i < ZIL_BURSTS; i++)
+ o = MIN(o, zilog->zl_prev_opt[i]);
+
+ /* Find two biggest minimal first block sizes above the optimal. */
+ uint_t m1 = MAX(m, o), m2 = o;
+ for (int i = 0; i < ZIL_BURSTS; i++) {
+ m = zilog->zl_prev_min[i];
+ if (m >= m1) {
+ m2 = m1;
+ m1 = m;
+ } else if (m > m2) {
+ m2 = m;
+ }
+ }
+
+ /*
+	 * If the second minimum size gives a 50% saving -- use it. It may cost
+	 * us one additional write later, but the space saving is just too big.
+ */
+ return ((m1 < m2 * 2) ? m1 : m2);
}
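A throwaway user-space illustration of the selection rule above, using made-up history values (seven small bursts plus one 96K outlier); the point is that the outlier alone does not pull the prediction up, because serving it from the smaller size costs at most one extra write.

/* Hypothetical numbers: find the two largest minimums and apply the rule. */
#include <stdio.h>

int
main(void)
{
	unsigned mins[8] = { 8192, 98304, 12288, 8192, 16384, 8192, 8192, 12288 };
	unsigned o = 16384;		/* assumed minimum optimal size */
	unsigned m1 = o, m2 = o;

	for (int i = 0; i < 8; i++) {
		if (mins[i] >= m1) {
			m2 = m1;
			m1 = mins[i];
		} else if (mins[i] > m2) {
			m2 = mins[i];
		}
	}
	/* m1 = 98304, m2 = 16384; 98304 >= 2 * 16384, so predict 16384. */
	printf("%u\n", (m1 < m2 * 2) ? m1 : m2);
	return (0);
}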
/*
- * Define a limited set of intent log block sizes.
- *
- * These must be a multiple of 4KB. Note only the amount used (again
- * aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
+ * Close the current log block so it can be issued and allocate the next one.
+ * Has to be called under zl_issuer_lock to chain more lwbs.
*/
-struct {
- uint64_t limit;
- uint64_t blksz;
-} zil_block_buckets[] = {
- { 4096, 4096 }, /* non TX_WRITE */
- { 8192 + 4096, 8192 + 4096 }, /* database */
- { 32768 + 4096, 32768 + 4096 }, /* NFS writes */
- { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
- { 131072, 131072 }, /* < 128KB writes */
- { 131072 +4096, 65536 + 4096 }, /* 128KB writes */
- { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */
-};
+static lwb_t *
+zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
+{
+ uint64_t blksz, plan, plan2;
-/*
- * Maximum block size used by the ZIL. This is picked up when the ZIL is
- * initialized. Otherwise this should not be used directly; see
- * zl_max_block_size instead.
- */
-int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ lwb->lwb_state = LWB_STATE_CLOSED;
+
+ /*
+	 * If there was an allocation failure, then the NULL returned here will
+	 * trigger zil_commit_writer_stall() at the caller. This is inherently
+	 * racy, since the allocation may not have happened yet.
+ */
+ if (lwb->lwb_error != 0)
+ return (NULL);
+
+ /*
+ * Log blocks are pre-allocated. Here we select the size of the next
+ * block, based on what's left of this burst and the previous history.
+	 * While we try to write only the used part of the block, we can't just
+ * always allocate the maximum block size because we can exhaust all
+ * available pool log space, so we try to be reasonable.
+ */
+ if (zilog->zl_cur_left > 0) {
+ /*
+ * We are in the middle of a burst and know how much is left.
+		 * But if the workload is multi-threaded there may be more soon.
+		 * Try to predict what it can be and plan for the worst case.
+ */
+ uint_t m;
+ plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
+ if (zilog->zl_parallel) {
+ plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
+ zil_lwb_predict(zilog), &m);
+ if (plan < plan2)
+ plan = plan2;
+ }
+ } else {
+ /*
+ * The previous burst is done and we can only predict what
+ * will come next.
+ */
+ plan = zil_lwb_predict(zilog);
+ }
+ blksz = plan + sizeof (zil_chain_t);
+ blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
+ blksz = MIN(blksz, zilog->zl_max_block_size);
+ DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
+ uint64_t, plan);
+
+ return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
+}
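The final rounding above can be sanity-checked in a few lines; the 192-byte chain header and the 4K alignment below are assumptions for illustration, and P2ROUNDUP is redefined locally as the usual power-of-two round-up.

/*
 * Hypothetical check of the size rounding: planned payload plus an
 * assumed 192-byte zil_chain_t, rounded up to an assumed 4K ZIL_MIN_BLKSZ.
 */
#include <stdio.h>

#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)

int
main(void)
{
	unsigned long plan = 102400;			/* planned payload */
	unsigned long blksz = P2ROUNDUP(plan + 192, 4096UL);
	printf("%lu\n", blksz);				/* 106496 = 26 * 4K */
	return (0);
}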
/*
- * Start a log block write and advance to the next log block.
- * Calls are serialized.
+ * Finalize the previously closed block and issue the write zio.
*/
-static lwb_t *
+static void
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
- lwb_t *nlwb = NULL;
- zil_chain_t *zilc;
spa_t *spa = zilog->zl_spa;
- blkptr_t *bp;
- dmu_tx_t *tx;
- uint64_t txg;
- uint64_t zil_blksz, wsz;
- int i, error;
+ zil_chain_t *zilc;
boolean_t slog;
+ zbookmark_phys_t zb;
+ zio_priority_t prio;
+ int error;
- ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- ASSERT3P(lwb->lwb_root_zio, !=, NULL);
- ASSERT3P(lwb->lwb_write_zio, !=, NULL);
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
- if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
- zilc = (zil_chain_t *)lwb->lwb_buf;
- bp = &zilc->zc_next_blk;
- } else {
- zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
- bp = &zilc->zc_next_blk;
- }
+ /* Actually fill the lwb with the data. */
+ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
+ itx = list_next(&lwb->lwb_itxs, itx))
+ zil_lwb_commit(zilog, lwb, itx);
+ lwb->lwb_nused = lwb->lwb_nfilled;
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
- ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
+ lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
+ ZIO_FLAG_CANFAIL);
/*
- * Allocate the next block and save its address in this block
- * before writing it in order to establish the log chain.
- * Note that if the allocation of nlwb synced before we wrote
- * the block that points at it (lwb), we'd leak it if we crashed.
- * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
- * We dirty the dataset to ensure that zil_sync() will be called
- * to clean up in the event of allocation failure or I/O failure.
+	 * The lwb is now ready to be issued, but it can be issued only if it has
+	 * already got its block pointer allocated or the allocation has failed.
+	 * Otherwise leave it as-is, relying on some other thread to issue it
+	 * after allocating its block pointer by calling zil_lwb_write_issue()
+ * for the previous lwb(s) in the chain.
*/
+ mutex_enter(&zilog->zl_lock);
+ lwb->lwb_state = LWB_STATE_READY;
+ if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ mutex_exit(&zilog->zl_lock);
- tx = dmu_tx_create(zilog->zl_os);
+next_lwb:
+ if (lwb->lwb_slim)
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ else
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
+ int wsz = lwb->lwb_sz;
+ if (lwb->lwb_error == 0) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
+ if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
+ lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
+ &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
+ lwb, prio, ZIO_FLAG_CANFAIL, &zb);
+ zil_lwb_add_block(lwb, &lwb->lwb_blk);
+
+ if (lwb->lwb_slim) {
+ /* For Slim ZIL only write what is used. */
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
+ int);
+ ASSERT3S(wsz, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_write_zio, wsz);
+ wsz = lwb->lwb_write_zio->io_size;
+ }
+ memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+ } else {
+ /*
+ * We can't write the lwb if there was an allocation failure,
+ * so create a null zio instead just to maintain dependencies.
+ */
+ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
+ zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
+ lwb->lwb_write_zio->io_error = lwb->lwb_error;
+ }
+ if (lwb->lwb_child_zio)
+ zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
/*
- * Since we are not going to create any new dirty data, and we
- * can even help with clearing the existing dirty data, we
- * should not be subject to the dirty data based delays. We
- * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ * Open transaction to allocate the next block pointer.
*/
+ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
-
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- txg = dmu_tx_get_txg(tx);
-
- lwb->lwb_tx = tx;
+ uint64_t txg = dmu_tx_get_txg(tx);
/*
- * Log blocks are pre-allocated. Here we select the size of the next
- * block, based on size used in the last block.
- * - first find the smallest bucket that will fit the block from a
- * limited set of block sizes. This is because it's faster to write
- * blocks allocated from the same metaslab as they are adjacent or
- * close.
- * - next find the maximum from the new suggested size and an array of
- * previous sizes. This lessens a picket fence effect of wrongly
- * guessing the size if we have a stream of say 2k, 64k, 2k, 64k
- * requests.
- *
- * Note we only write what is used, but we can't just allocate
- * the maximum block size because we can exhaust the available
- * pool log space.
- */
- zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
- for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
- continue;
- zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
- zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
- for (i = 0; i < ZIL_PREV_BLKS; i++)
- zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
- zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
-
+	 * Allocate the next block pointer unless we are already in error.
+ */
+ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
+ blkptr_t *bp = &zilc->zc_next_blk;
BP_ZERO(bp);
- error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
- if (slog) {
- ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
- ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
- } else {
- ZIL_STAT_BUMP(zil_itx_metaslab_normal_count);
- ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused);
+ error = lwb->lwb_error;
+ if (error == 0) {
+ error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
+ &slog);
}
if (error == 0) {
- ASSERT3U(bp->blk_birth, ==, txg);
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
+ BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
+ ZIO_CHECKSUM_ZILOG);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
-
- /*
- * Allocate a new log write block (lwb).
- */
- nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
}
- if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
- /* For Slim ZIL only write what is used. */
- wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
- ASSERT3U(wsz, <=, lwb->lwb_sz);
- zio_shrink(lwb->lwb_write_zio, wsz);
-
- } else {
- wsz = lwb->lwb_sz;
- }
+ /*
+	 * Reduce TXG open time by incrementing the inflight counter and
+	 * committing the transaction. zil_sync() will wait for it to return to zero.
+ */
+ mutex_enter(&zilog->zl_lwb_io_lock);
+ lwb->lwb_issued_txg = txg;
+ zilog->zl_lwb_inflight[txg & TXG_MASK]++;
+ zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
+ mutex_exit(&zilog->zl_lwb_io_lock);
+ dmu_tx_commit(tx);
- zilc->zc_pad = 0;
- zilc->zc_nused = lwb->lwb_nused;
- zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+ spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
/*
- * clear unused data for security
+ * We've completed all potentially blocking operations. Update the
+	 * nlwb and allow it to proceed without possible lock order reversals.
*/
- bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
+ mutex_enter(&zilog->zl_lock);
+ zil_lwb_set_zio_dependency(zilog, lwb);
+ lwb->lwb_state = LWB_STATE_ISSUED;
- spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
+ if (nlwb) {
+ nlwb->lwb_blk = *bp;
+ nlwb->lwb_error = error;
+ nlwb->lwb_slog = slog;
+ nlwb->lwb_alloc_txg = txg;
+ if (nlwb->lwb_state != LWB_STATE_READY)
+ nlwb = NULL;
+ }
+ mutex_exit(&zilog->zl_lock);
- zil_lwb_add_block(lwb, &lwb->lwb_blk);
+ if (lwb->lwb_slog) {
+ ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
+ lwb->lwb_nused);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
+ wsz);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
+ BP_GET_LSIZE(&lwb->lwb_blk));
+ } else {
+ ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
+ lwb->lwb_nused);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
+ wsz);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
+ BP_GET_LSIZE(&lwb->lwb_blk));
+ }
lwb->lwb_issued_timestamp = gethrtime();
- lwb->lwb_state = LWB_STATE_ISSUED;
-
- zio_nowait(lwb->lwb_root_zio);
+ if (lwb->lwb_child_zio)
+ zio_nowait(lwb->lwb_child_zio);
zio_nowait(lwb->lwb_write_zio);
+ zio_nowait(lwb->lwb_root_zio);
/*
- * If there was an allocation failure then nlwb will be null which
- * forces a txg_wait_synced().
+ * If nlwb was ready when we gave it the block pointer,
+	 * it is on us to issue it and possibly the following ones.
*/
- return (nlwb);
+ lwb = nlwb;
+ if (lwb)
+ goto next_lwb;
}
/*
- * Maximum amount of write data that can be put into single log block.
+ * Maximum amount of data that can be put into single log block.
*/
uint64_t
-zil_max_log_data(zilog_t *zilog)
+zil_max_log_data(zilog_t *zilog, size_t hdrsize)
{
- return (zilog->zl_max_block_size -
- sizeof (zil_chain_t) - sizeof (lr_write_t));
+ return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize);
}
/*
* Maximum amount of log space we agree to waste to reduce number of
- * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
+ * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%).
*/
static inline uint64_t
zil_max_waste_space(zilog_t *zilog)
{
- return (zil_max_log_data(zilog) / 8);
+ return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16);
}
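The ~6% figure in the comment is simply the 1/16 divisor applied above; a quick stand-alone check with an assumed 128K block and rough header sizes:

/* Quick check of where the ~6% in the comment comes from (1/16). */
#include <stdio.h>

int
main(void)
{
	double max_block = 131072;	/* assumed zl_max_block_size */
	double hdrs = 192 + 192;	/* assumed zil_chain_t + lr_write_t */
	double waste = (max_block - hdrs) / 16;
	printf("%.1f%%\n", 100 * waste / max_block);	/* prints ~6.2% */
	return (0);
}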
/*
* Maximum amount of write data for WR_COPIED. For correctness, consumers
* must fall back to WR_NEED_COPY if we can't fit the entire record into one
* maximum sized log block, because each WR_COPIED record must fit in a
- * single log block. For space efficiency, we want to fit two records into a
- * max-sized log block.
+ * single log block. Below that it is a tradeoff between an additional
+ * memory copy and possibly worse log space efficiency versus an additional
+ * range lock/unlock.
*/
+static uint_t zil_maxcopied = 7680;
+
uint64_t
zil_max_copied_data(zilog_t *zilog)
{
- return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
- sizeof (lr_write_t));
+ uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t));
+ return (MIN(max_data, zil_maxcopied));
}
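For context on how this limit is consumed, here is a simplified stand-alone analogue of the decision a log consumer makes; the helper name and local enum are hypothetical, and the real policy in callers such as zfs_log_write() also weighs zfs_immediate_write_sz and the presence of a log device.

/*
 * Hypothetical user-space analogue of how the zil_max_copied_data()
 * limit bounds the WR_COPIED path.
 */
#include <stdio.h>
#include <stdint.h>

enum wr_state { WR_INDIRECT, WR_COPIED, WR_NEED_COPY };

static enum wr_state
choose_wr_state(uint64_t resid, uint64_t max_copied, int indirect_ok)
{
	if (indirect_ok)
		return (WR_INDIRECT);	/* log a block pointer, not the data */
	if (resid <= max_copied)
		return (WR_COPIED);	/* data embedded in the itx itself */
	return (WR_NEED_COPY);		/* data fetched at commit time */
}

int
main(void)
{
	/* With the default zil_maxcopied of 7680, a 4K write is embedded. */
	printf("%d %d\n", choose_wr_state(4096, 7680, 0),
	    choose_wr_state(65536, 7680, 0));	/* prints 1 2 */
	return (0);
}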
+static uint64_t
+zil_itx_record_size(itx_t *itx)
+{
+ lr_t *lr = &itx->itx_lr;
+
+ if (lr->lrc_txtype == TX_COMMIT)
+ return (0);
+ ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
+ return (lr->lrc_reclen);
+}
+
+static uint64_t
+zil_itx_data_size(itx_t *itx)
+{
+ lr_t *lr = &itx->itx_lr;
+ lr_write_t *lrw = (lr_write_t *)lr;
+
+ if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
+ ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t));
+ return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t),
+ uint64_t));
+ }
+ return (0);
+}
+
+static uint64_t
+zil_itx_full_size(itx_t *itx)
+{
+ lr_t *lr = &itx->itx_lr;
+
+ if (lr->lrc_txtype == TX_COMMIT)
+ return (0);
+ ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
+ return (lr->lrc_reclen + zil_itx_data_size(itx));
+}
+
+/*
+ * Estimate space needed in the lwb for the itx. Allocate more lwbs or
+ * split the itx as needed, but don't touch the actual transaction data.
+ * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
+ * to chain more lwbs.
+ */
static lwb_t *
-zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
{
- lr_t *lrcb, *lrc;
- lr_write_t *lrwb, *lrw;
- char *lr_buf;
- uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data;
+ itx_t *citx;
+ lr_t *lr, *clr;
+ lr_write_t *lrw;
+ uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
@@ -1655,8 +2121,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
zil_lwb_write_open(zilog, lwb);
- lrc = &itx->itx_lr;
- lrw = (lr_write_t *)lrc;
+ lr = &itx->itx_lr;
+ lrw = (lr_write_t *)lr;
/*
* A commit itx doesn't represent any on-disk state; instead
@@ -1670,150 +2136,207 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
*
* For more details, see the comment above zil_commit().
*/
- if (lrc->lrc_txtype == TX_COMMIT) {
- mutex_enter(&zilog->zl_lock);
+ if (lr->lrc_txtype == TX_COMMIT) {
zil_commit_waiter_link_lwb(itx->itx_private, lwb);
- itx->itx_private = NULL;
- mutex_exit(&zilog->zl_lock);
+ list_insert_tail(&lwb->lwb_itxs, itx);
return (lwb);
}
- if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
- dlen = P2ROUNDUP_TYPED(
- lrw->lr_length, sizeof (uint64_t), uint64_t);
- dpad = dlen - lrw->lr_length;
- } else {
- dlen = dpad = 0;
- }
- reclen = lrc->lrc_reclen;
- zilog->zl_cur_used += (reclen + dlen);
- txg = lrc->lrc_txg;
-
- ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
+ reclen = lr->lrc_reclen;
+ ASSERT3U(reclen, >=, sizeof (lr_t));
+ ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
+ dlen = zil_itx_data_size(itx);
cont:
/*
* If this record won't fit in the current log block, start a new one.
* For WR_NEED_COPY optimize layout for minimal number of chunks.
*/
- lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
- max_log_data = zil_max_log_data(zilog);
+ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
+ max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) {
- lwb = zil_lwb_write_issue(zilog, lwb);
+ list_insert_tail(ilwbs, lwb);
+ lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
if (lwb == NULL)
return (NULL);
- zil_lwb_write_open(zilog, lwb);
- ASSERT(LWB_EMPTY(lwb));
- lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
-
- /*
- * There must be enough space in the new, empty log block to
- * hold reclen. For WR_COPIED, we need to fit the whole
- * record in one block, and reclen is the header size + the
- * data size. For WR_NEED_COPY, we can create multiple
- * records, splitting the data into multiple blocks, so we
- * only need to fit one word of data per block; in this case
- * reclen is just the header size (no data).
- */
- ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
+ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
}
+ /*
+ * There must be enough space in the log block to hold reclen.
+ * For WR_COPIED, we need to fit the whole record in one block,
+ * and reclen is the write record header size + the data size.
+ * For WR_NEED_COPY, we can create multiple records, splitting
+ * the data into multiple blocks, so we only need to fit one
+ * word of data per block; in this case reclen is just the header
+ * size (no data).
+ */
+ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
+
dnow = MIN(dlen, lwb_sp - reclen);
- lr_buf = lwb->lwb_buf + lwb->lwb_nused;
- bcopy(lrc, lr_buf, reclen);
- lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
- lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
+ if (dlen > dnow) {
+ ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
+ ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
+ citx = zil_itx_clone(itx);
+ clr = &citx->itx_lr;
+ lr_write_t *clrw = (lr_write_t *)clr;
+ clrw->lr_length = dnow;
+ lrw->lr_offset += dnow;
+ lrw->lr_length -= dnow;
+ zilog->zl_cur_left -= dnow;
+ } else {
+ citx = itx;
+ clr = lr;
+ }
+
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ clr->lrc_seq = ++zilog->zl_lr_seq;
+
+ lwb->lwb_nused += reclen + dnow;
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
+ ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+
+ zil_lwb_add_txg(lwb, lr->lrc_txg);
+ list_insert_tail(&lwb->lwb_itxs, citx);
+
+ dlen -= dnow;
+ if (dlen > 0)
+ goto cont;
+
+ if (lr->lrc_txtype == TX_WRITE &&
+ lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
+
+ return (lwb);
+}
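A stand-alone walk-through of the WR_NEED_COPY split performed above; the record header size and the per-lwb free space values are made up for the example.

/*
 * Stand-alone walk of the split: a 100K WR_NEED_COPY record spread
 * over two lwbs with the (made-up) free space shown below.
 */
#include <stdio.h>

int
main(void)
{
	unsigned reclen = 192;				/* assumed lr_write_t size */
	unsigned dlen = 100 * 1024;			/* data left to place */
	unsigned lwb_sp[] = { 40960, 131072 - 192 };	/* free bytes per lwb */

	for (int i = 0; dlen > 0; i++) {
		unsigned dnow = dlen < lwb_sp[i] - reclen ?
		    dlen : lwb_sp[i] - reclen;	/* MIN(dlen, lwb_sp - reclen) */
		printf("lwb %d: %u header + %u data bytes\n", i, reclen, dnow);
		dlen -= dnow;
	}
	return (0);
}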
+
+/*
+ * Fill the actual transaction data into the lwb, following zil_lwb_assign().
+ * Does not require locking.
+ */
+static void
+zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
+{
+ lr_t *lr, *lrb;
+ lr_write_t *lrw, *lrwb;
+ char *lr_buf;
+ uint64_t dlen, reclen;
+
+ lr = &itx->itx_lr;
+ lrw = (lr_write_t *)lr;
- ZIL_STAT_BUMP(zil_itx_count);
+ if (lr->lrc_txtype == TX_COMMIT)
+ return;
+
+ reclen = lr->lrc_reclen;
+ dlen = zil_itx_data_size(itx);
+ ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
+
+ lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
+ memcpy(lr_buf, lr, reclen);
+ lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */
+ lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */
+
+ ZIL_STAT_BUMP(zilog, zil_itx_count);
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
*/
- if (lrc->lrc_txtype == TX_WRITE) {
- if (txg > spa_freeze_txg(zilog->zl_spa))
- txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (lr->lrc_txtype == TX_WRITE) {
if (itx->itx_wr_state == WR_COPIED) {
- ZIL_STAT_BUMP(zil_itx_copied_count);
- ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length);
+ ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
+ ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
+ lrw->lr_length);
} else {
char *dbuf;
int error;
if (itx->itx_wr_state == WR_NEED_COPY) {
dbuf = lr_buf + reclen;
- lrcb->lrc_reclen += dnow;
- if (lrwb->lr_length > dnow)
- lrwb->lr_length = dnow;
- lrw->lr_offset += dnow;
- lrw->lr_length -= dnow;
- ZIL_STAT_BUMP(zil_itx_needcopy_count);
- ZIL_STAT_INCR(zil_itx_needcopy_bytes, dnow);
+ lrb->lrc_reclen += dlen;
+ ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
+ ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
+ dlen);
} else {
ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
dbuf = NULL;
- ZIL_STAT_BUMP(zil_itx_indirect_count);
- ZIL_STAT_INCR(zil_itx_indirect_bytes,
+ ZIL_STAT_BUMP(zilog, zil_itx_indirect_count);
+ ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
lrw->lr_length);
+ if (lwb->lwb_child_zio == NULL) {
+ lwb->lwb_child_zio = zio_null(NULL,
+ zilog->zl_spa, NULL, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
}
/*
- * We pass in the "lwb_write_zio" rather than
- * "lwb_root_zio" so that the "lwb_write_zio"
- * becomes the parent of any zio's created by
- * the "zl_get_data" callback. The vdevs are
- * flushed after the "lwb_write_zio" completes,
- * so we want to make sure that completion
- * callback waits for these additional zio's,
- * such that the vdevs used by those zio's will
- * be included in the lwb's vdev tree, and those
- * vdevs will be properly flushed. If we passed
- * in "lwb_root_zio" here, then these additional
- * vdevs may not be flushed; e.g. if these zio's
- * completed after "lwb_write_zio" completed.
+			 * The "lwb_child_zio" we pass in will become a child
+			 * of "lwb_write_zio", once one is created, and so a
+			 * parent of any zio's created by the "zl_get_data"
+			 * callback. This way "lwb_write_zio" will first wait
+			 * for the children's block pointers before its own
+			 * write, and then for their write completion before
+			 * flushing the vdev cache.
*/
error = zilog->zl_get_data(itx->itx_private,
itx->itx_gen, lrwb, dbuf, lwb,
- lwb->lwb_write_zio);
- if (dbuf != NULL && error == 0 && dnow == dlen)
+ lwb->lwb_child_zio);
+ if (dbuf != NULL && error == 0) {
/* Zero any padding bytes in the last block. */
- bzero((char *)dbuf + lrwb->lr_length, dpad);
-
- if (error == EIO) {
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- return (lwb);
+ memset((char *)dbuf + lrwb->lr_length, 0,
+ dlen - lrwb->lr_length);
}
- if (error != 0) {
- ASSERT(error == ENOENT || error == EEXIST ||
- error == EALREADY);
- return (lwb);
+
+ /*
+ * Typically, the only return values we should see from
+ * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or
+ * EALREADY. However, it is also possible to see other
+ * error values such as ENOSPC or EINVAL from
+ * dmu_read() -> dnode_hold() -> dnode_hold_impl() or
+ * ENXIO as well as a multitude of others from the
+ * block layer through dmu_buf_hold() -> dbuf_read()
+ * -> zio_wait(), as well as through dmu_read() ->
+ * dnode_hold() -> dnode_hold_impl() -> dbuf_read() ->
+ * zio_wait(). When these errors happen, we can assume
+ * that neither an immediate write nor an indirect
+ * write occurred, so we need to fall back to
+ * txg_wait_synced(). This is unusual, so we print to
+ * dmesg whenever one of these errors occurs.
+ */
+ switch (error) {
+ case 0:
+ break;
+ default:
+ cmn_err(CE_WARN, "zil_lwb_commit() received "
+ "unexpected error %d from ->zl_get_data()"
+ ". Falling back to txg_wait_synced().",
+ error);
+ zfs_fallthrough;
+ case EIO:
+ txg_wait_synced(zilog->zl_dmu_pool,
+ lr->lrc_txg);
+ zfs_fallthrough;
+ case ENOENT:
+ zfs_fallthrough;
+ case EEXIST:
+ zfs_fallthrough;
+ case EALREADY:
+ return;
}
}
}
- /*
- * We're actually making an entry, so update lrc_seq to be the
- * log record sequence number. Note that this is generally not
- * equal to the itx sequence number because not all transactions
- * are synchronous, and sometimes spa_sync() gets there first.
- */
- lrcb->lrc_seq = ++zilog->zl_lr_seq;
- lwb->lwb_nused += reclen + dnow;
-
- zil_lwb_add_txg(lwb, txg);
-
- ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
- ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
-
- dlen -= dnow;
- if (dlen > 0) {
- zilog->zl_cur_used += reclen;
- goto cont;
- }
-
- return (lwb);
+ lwb->lwb_nfilled += reclen + dlen;
+ ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
+ ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
}
itx_t *
@@ -1822,14 +2345,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
size_t itxsize, lrsize;
itx_t *itx;
+ ASSERT3U(olrsize, >=, sizeof (lr_t));
lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
+ ASSERT3U(lrsize, >=, olrsize);
itxsize = offsetof(itx_t, itx_lr) + lrsize;
itx = zio_data_buf_alloc(itxsize);
itx->itx_lr.lrc_txtype = txtype;
itx->itx_lr.lrc_reclen = lrsize;
itx->itx_lr.lrc_seq = 0; /* defensive */
- bzero((char *)&itx->itx_lr + olrsize, lrsize - olrsize);
+ memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize);
itx->itx_sync = B_TRUE; /* default is synchronous */
itx->itx_callback = NULL;
itx->itx_callback_data = NULL;
@@ -1838,9 +2363,26 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
return (itx);
}
+static itx_t *
+zil_itx_clone(itx_t *oitx)
+{
+ ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
+ ASSERT3U(oitx->itx_size, ==,
+ offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
+
+ itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
+ memcpy(itx, oitx, oitx->itx_size);
+ itx->itx_callback = NULL;
+ itx->itx_callback_data = NULL;
+ return (itx);
+}
+
void
zil_itx_destroy(itx_t *itx)
{
+ ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
+ ASSERT3U(itx->itx_lr.lrc_reclen, ==,
+ itx->itx_size - offsetof(itx_t, itx_lr));
IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
@@ -1865,11 +2407,11 @@ zil_itxg_clean(void *arg)
itx_async_node_t *ian;
list = &itxs->i_sync_list;
- while ((itx = list_head(list)) != NULL) {
+ while ((itx = list_remove_head(list)) != NULL) {
/*
* In the general case, commit itxs will not be found
* here, as they'll be committed to an lwb via
- * zil_lwb_commit(), and free'd in that function. Having
+ * zil_lwb_assign(), and free'd in that function. Having
* said that, it is still possible for commit itxs to be
* found here, due to the following race:
*
@@ -1888,7 +2430,6 @@ zil_itxg_clean(void *arg)
if (itx->itx_lr.lrc_txtype == TX_COMMIT)
zil_commit_waiter_skip(itx->itx_private);
- list_remove(list, itx);
zil_itx_destroy(itx);
}
@@ -1896,8 +2437,7 @@ zil_itxg_clean(void *arg)
t = &itxs->i_async_tree;
while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
list = &ian->ia_list;
- while ((itx = list_head(list)) != NULL) {
- list_remove(list, itx);
+ while ((itx = list_remove_head(list)) != NULL) {
/* commit itxs should never be on the async lists. */
ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
zil_itx_destroy(itx);
@@ -1926,7 +2466,7 @@ void
zil_remove_async(zilog_t *zilog, uint64_t oid)
{
uint64_t otxg, txg;
- itx_async_node_t *ian;
+ itx_async_node_t *ian, ian_search;
avl_tree_t *t;
avl_index_t where;
list_t clean_list;
@@ -1953,13 +2493,13 @@ zil_remove_async(zilog_t *zilog, uint64_t oid)
* Locate the object node and append its list.
*/
t = &itxg->itxg_itxs->i_async_tree;
- ian = avl_find(t, &oid, &where);
+ ian_search.ia_foid = oid;
+ ian = avl_find(t, &ian_search, &where);
if (ian != NULL)
list_move_tail(&clean_list, &ian->ia_list);
mutex_exit(&itxg->itxg_lock);
}
- while ((itx = list_head(&clean_list)) != NULL) {
- list_remove(&clean_list, itx);
+ while ((itx = list_remove_head(&clean_list)) != NULL) {
/* commit itxs should never be on the async lists. */
ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
zil_itx_destroy(itx);
@@ -2090,10 +2630,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
* This function will traverse the queue of itxs that need to be
* committed, and move them onto the ZIL's zl_itx_commit_list.
*/
-static void
+static uint64_t
zil_get_commit_list(zilog_t *zilog)
{
- uint64_t otxg, txg;
+ uint64_t otxg, txg, wtxg = 0;
list_t *commit_list = &zilog->zl_itx_commit_list;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@@ -2127,10 +2667,33 @@ zil_get_commit_list(zilog_t *zilog)
*/
ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
- list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+ list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
+ itx_t *itx = NULL;
+ if (unlikely(zilog->zl_suspend > 0)) {
+ /*
+ * ZIL was just suspended, but we lost the race.
+ * Allow all earlier itxs to be committed, but ask
+			 * the caller to do txg_wait_synced(txg) for any new ones.
+ */
+ if (!list_is_empty(sync_list))
+ wtxg = MAX(wtxg, txg);
+ } else {
+ itx = list_head(sync_list);
+ list_move_tail(commit_list, sync_list);
+ }
mutex_exit(&itxg->itxg_lock);
+
+ while (itx != NULL) {
+ uint64_t s = zil_itx_full_size(itx);
+ zilog->zl_cur_size += s;
+ zilog->zl_cur_left += s;
+ s = zil_itx_record_size(itx);
+ zilog->zl_cur_max = MAX(zilog->zl_cur_max, s);
+ itx = list_next(commit_list, itx);
+ }
}
+ return (wtxg);
}
/*
@@ -2140,7 +2703,7 @@ void
zil_async_to_sync(zilog_t *zilog, uint64_t foid)
{
uint64_t otxg, txg;
- itx_async_node_t *ian;
+ itx_async_node_t *ian, ian_search;
avl_tree_t *t;
avl_index_t where;
@@ -2170,7 +2733,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid)
*/
t = &itxg->itxg_itxs->i_async_tree;
if (foid != 0) {
- ian = avl_find(t, &foid, &where);
+ ian_search.ia_foid = foid;
+ ian = avl_find(t, &ian_search, &where);
if (ian != NULL) {
list_move_tail(&itxg->itxg_itxs->i_sync_list,
&ian->ia_list);
@@ -2224,7 +2788,6 @@ zil_prune_commit_list(zilog_t *zilog)
zil_commit_waiter_skip(itx->itx_private);
} else {
zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
- itx->itx_private = NULL;
}
mutex_exit(&zilog->zl_lock);
@@ -2261,7 +2824,27 @@ zil_commit_writer_stall(zilog_t *zilog)
*/
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
txg_wait_synced(zilog->zl_dmu_pool, 0);
- ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+}
+
+static void
+zil_burst_done(zilog_t *zilog)
+{
+ if (!list_is_empty(&zilog->zl_itx_commit_list) ||
+ zilog->zl_cur_size == 0)
+ return;
+
+ if (zilog->zl_parallel)
+ zilog->zl_parallel--;
+
+ uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1);
+ zilog->zl_prev_rotor = r;
+ zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size,
+ &zilog->zl_prev_min[r]);
+
+ zilog->zl_cur_size = 0;
+ zilog->zl_cur_max = 0;
+ zilog->zl_cur_left = 0;
}
/*
@@ -2271,12 +2854,12 @@ zil_commit_writer_stall(zilog_t *zilog)
* lwb will be issued to the zio layer to be written to disk.
*/
static void
-zil_process_commit_list(zilog_t *zilog)
+zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
{
spa_t *spa = zilog->zl_spa;
list_t nolwb_itxs;
list_t nolwb_waiters;
- lwb_t *lwb;
+ lwb_t *lwb, *plwb;
itx_t *itx;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@@ -2285,7 +2868,7 @@ zil_process_commit_list(zilog_t *zilog)
* Return if there's nothing to commit before we dirty the fs by
* calling zil_create().
*/
- if (list_head(&zilog->zl_itx_commit_list) == NULL)
+ if (list_is_empty(&zilog->zl_itx_commit_list))
return;
list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
@@ -2296,12 +2879,32 @@ zil_process_commit_list(zilog_t *zilog)
if (lwb == NULL) {
lwb = zil_create(zilog);
} else {
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+ /*
+ * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will
+ * have already been created (zl_lwb_list not empty).
+ */
+ zil_commit_activate_saxattr_feature(zilog);
+ ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
+ lwb->lwb_state == LWB_STATE_OPENED);
+
+ /*
+		 * If the lwb is still opened, it means the workload is really
+		 * multi-threaded and we have won the chance of write aggregation.
+		 * If it is not opened yet, but the previous lwb is still not
+		 * flushed, it still means the workload is multi-threaded, but
+		 * there was too much time between the commits to aggregate, so
+		 * we try aggregation next time, but without too much hope.
+ */
+ if (lwb->lwb_state == LWB_STATE_OPENED) {
+ zilog->zl_parallel = ZIL_BURSTS;
+ } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
+ != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
+ zilog->zl_parallel = MAX(zilog->zl_parallel,
+ ZIL_BURSTS / 2);
+ }
}
- while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+ while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
lr_t *lrc = &itx->itx_lr;
uint64_t txg = lrc->lrc_txg;
@@ -2315,8 +2918,6 @@ zil_process_commit_list(zilog_t *zilog)
zilog_t *, zilog, itx_t *, itx);
}
- list_remove(&zilog->zl_itx_commit_list, itx);
-
boolean_t synced = txg <= spa_last_synced_txg(spa);
boolean_t frozen = txg > spa_freeze_txg(spa);
@@ -2366,22 +2967,31 @@ zil_process_commit_list(zilog_t *zilog)
*/
if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
if (lwb != NULL) {
- lwb = zil_lwb_commit(zilog, itx, lwb);
-
- if (lwb == NULL)
+ lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
+ if (lwb == NULL) {
list_insert_tail(&nolwb_itxs, itx);
- else
- list_insert_tail(&lwb->lwb_itxs, itx);
+ } else if ((zcw->zcw_lwb != NULL &&
+ zcw->zcw_lwb != lwb) || zcw->zcw_done) {
+ /*
+				 * Our lwb is done, leave the rest of the
+				 * itx list to somebody else who cares.
+ */
+ zilog->zl_parallel = ZIL_BURSTS;
+ zilog->zl_cur_left -=
+ zil_itx_full_size(itx);
+ break;
+ }
} else {
if (lrc->lrc_txtype == TX_COMMIT) {
zil_commit_waiter_link_nolwb(
itx->itx_private, &nolwb_waiters);
}
-
list_insert_tail(&nolwb_itxs, itx);
}
+ zilog->zl_cur_left -= zil_itx_full_size(itx);
} else {
ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
+ zilog->zl_cur_left -= zil_itx_full_size(itx);
zil_itx_destroy(itx);
}
}
@@ -2393,6 +3003,8 @@ zil_process_commit_list(zilog_t *zilog)
* the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details.
*/
+ while ((lwb = list_remove_head(ilwbs)) != NULL)
+ zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
/*
@@ -2402,54 +3014,45 @@ zil_process_commit_list(zilog_t *zilog)
* normal.
*/
zil_commit_waiter_t *zcw;
- while ((zcw = list_head(&nolwb_waiters)) != NULL) {
+ while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
zil_commit_waiter_skip(zcw);
- list_remove(&nolwb_waiters, zcw);
- }
/*
* And finally, we have to destroy the itx's that
* couldn't be committed to an lwb; this will also call
* the itx's callback if one exists for the itx.
*/
- while ((itx = list_head(&nolwb_itxs)) != NULL) {
- list_remove(&nolwb_itxs, itx);
+ while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
zil_itx_destroy(itx);
- }
} else {
ASSERT(list_is_empty(&nolwb_waiters));
ASSERT3P(lwb, !=, NULL);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+ ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
+ lwb->lwb_state == LWB_STATE_OPENED);
/*
* At this point, the ZIL block pointed at by the "lwb"
- * variable is in one of the following states: "closed"
- * or "open".
+ * variable is in "new" or "opened" state.
*
- * If it's "closed", then no itxs have been committed to
- * it, so there's no point in issuing its zio (i.e. it's
- * "empty").
+ * If it's "new", then no itxs have been committed to it, so
+ * there's no point in issuing its zio (i.e. it's "empty").
*
- * If it's "open", then it contains one or more itxs that
+ * If it's "opened", then it contains one or more itxs that
* eventually need to be committed to stable storage. In
* this case we intentionally do not issue the lwb's zio
* to disk yet, and instead rely on one of the following
* two mechanisms for issuing the zio:
*
- * 1. Ideally, there will be more ZIL activity occurring
- * on the system, such that this function will be
- * immediately called again (not necessarily by the same
- * thread) and this lwb's zio will be issued via
- * zil_lwb_commit(). This way, the lwb is guaranteed to
- * be "full" when it is issued to disk, and we'll make
- * use of the lwb's size the best we can.
+ * 1. Ideally, there will be more ZIL activity occurring on
+ * the system, such that this function will be immediately
+	 *    called again by a different thread and this lwb will be
+ * closed by zil_lwb_assign(). This way, the lwb will be
+ * "full" when it is issued to disk, and we'll make use of
+ * the lwb's size the best we can.
*
* 2. If there isn't sufficient ZIL activity occurring on
- * the system, such that this lwb's zio isn't issued via
- * zil_lwb_commit(), zil_commit_waiter() will issue the
- * lwb's zio. If this occurs, the lwb is not guaranteed
+ * the system, zil_commit_waiter() will close it and issue
+ * the zio. If this occurs, the lwb is not guaranteed
* to be "full" by the time its zio is issued, and means
* the size of the lwb was "too large" given the amount
* of ZIL activity occurring on the system at that time.
@@ -2470,6 +3073,16 @@ zil_process_commit_list(zilog_t *zilog)
* possible, without significantly impacting the latency
* of each individual itx.
*/
+ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
+ zil_burst_done(zilog);
+ list_insert_tail(ilwbs, lwb);
+ lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
+ if (lwb == NULL) {
+ while ((lwb = list_remove_head(ilwbs)) != NULL)
+ zil_lwb_write_issue(zilog, lwb);
+ zil_commit_writer_stall(zilog);
+ }
+ }
}
}
@@ -2487,12 +3100,17 @@ zil_process_commit_list(zilog_t *zilog)
* not issued, we rely on future calls to zil_commit_writer() to issue
* the lwb, or the timeout mechanism found in zil_commit_waiter().
*/
-static void
+static uint64_t
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
+ list_t ilwbs;
+ lwb_t *lwb;
+ uint64_t wtxg = 0;
+
ASSERT(!MUTEX_HELD(&zilog->zl_lock));
ASSERT(spa_writeable(zilog->zl_spa));
+ list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
mutex_enter(&zilog->zl_issuer_lock);
if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
@@ -2515,14 +3133,18 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
goto out;
}
- ZIL_STAT_BUMP(zil_commit_writer_count);
+ ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
- zil_get_commit_list(zilog);
+ wtxg = zil_get_commit_list(zilog);
zil_prune_commit_list(zilog);
- zil_process_commit_list(zilog);
+ zil_process_commit_list(zilog, zcw, &ilwbs);
out:
mutex_exit(&zilog->zl_issuer_lock);
+ while ((lwb = list_remove_head(&ilwbs)) != NULL)
+ zil_lwb_write_issue(zilog, lwb);
+ list_destroy(&ilwbs);
+ return (wtxg);
}
static void
@@ -2534,7 +3156,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
lwb_t *lwb = zcw->zcw_lwb;
ASSERT3P(lwb, !=, NULL);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
/*
* If the lwb has already been issued by another thread, we can
@@ -2543,13 +3165,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* do this prior to acquiring the zl_issuer_lock, to avoid
* acquiring it when it's not necessary to do so.
*/
- if (lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ if (lwb->lwb_state != LWB_STATE_OPENED)
return;
/*
- * In order to call zil_lwb_write_issue() we must hold the
+ * In order to call zil_lwb_write_close() we must hold the
* zilog's "zl_issuer_lock". We can't simply acquire that lock,
* since we're already holding the commit waiter's "zcw_lock",
* and those two locks are acquired in the opposite order
@@ -2567,8 +3187,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* the waiter is marked "done"), so without this check we could
* wind up with a use-after-free error below.
*/
- if (zcw->zcw_done)
- goto out;
+ if (zcw->zcw_done) {
+ mutex_exit(&zilog->zl_issuer_lock);
+ return;
+ }
ASSERT3P(lwb, ==, zcw->zcw_lwb);
@@ -2578,26 +3200,33 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* second time while holding the lock.
*
* We don't need to hold the zl_lock since the lwb cannot transition
- * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
- * _can_ transition from ISSUED to DONE, but it's OK to race with
+ * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb
+ * _can_ transition from CLOSED to DONE, but it's OK to race with
* that transition since we treat the lwb the same, whether it's in
- * the ISSUED or DONE states.
+ * the CLOSED, ISSUED or DONE states.
*
* The important thing, is we treat the lwb differently depending on
- * if it's ISSUED or OPENED, and block any other threads that might
- * attempt to issue this lwb. For that reason we hold the
+ * if it's OPENED or CLOSED, and block any other threads that might
+ * attempt to close/issue this lwb. For that reason we hold the
* zl_issuer_lock when checking the lwb_state; we must not call
- * zil_lwb_write_issue() if the lwb had already been issued.
+ * zil_lwb_write_close() if the lwb had already been closed/issued.
*
* See the comment above the lwb_state_t structure definition for
* more details on the lwb states, and locking requirements.
*/
- if (lwb->lwb_state == LWB_STATE_ISSUED ||
- lwb->lwb_state == LWB_STATE_WRITE_DONE ||
- lwb->lwb_state == LWB_STATE_FLUSH_DONE)
- goto out;
+ if (lwb->lwb_state != LWB_STATE_OPENED) {
+ mutex_exit(&zilog->zl_issuer_lock);
+ return;
+ }
- ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+ /*
+ * We do not need zcw_lock once we hold zl_issuer_lock and know lwb
+	 * is still open. But we have to drop it to avoid a deadlock in case
+	 * the callback of a zio issued by zil_lwb_write_issue() tries to get
+	 * it, while zil_lwb_write_issue() is blocked attempting to issue the
+	 * next lwb it found in the LWB_STATE_READY state.
+ */
+ mutex_exit(&zcw->zcw_lock);
/*
* As described in the comments above zil_commit_waiter() and
@@ -2605,55 +3234,27 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* since we've reached the commit waiter's timeout and it still
* hasn't been issued.
*/
- lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+ zil_burst_done(zilog);
+ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
- IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
-
- /*
- * Since the lwb's zio hadn't been issued by the time this thread
- * reached its timeout, we reset the zilog's "zl_cur_used" field
- * to influence the zil block size selection algorithm.
- *
- * By having to issue the lwb's zio here, it means the size of the
- * lwb was too large, given the incoming throughput of itxs. By
- * setting "zl_cur_used" to zero, we communicate this fact to the
- * block size selection algorithm, so it can take this information
- * into account, and potentially select a smaller size for the
- * next lwb block that is allocated.
- */
- zilog->zl_cur_used = 0;
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
if (nlwb == NULL) {
/*
- * When zil_lwb_write_issue() returns NULL, this
+ * When zil_lwb_write_close() returns NULL, this
* indicates zio_alloc_zil() failed to allocate the
* "next" lwb on-disk. When this occurs, the ZIL write
* pipeline must be stalled; see the comment within the
* zil_commit_writer_stall() function for more details.
- *
- * We must drop the commit waiter's lock prior to
- * calling zil_commit_writer_stall() or else we can wind
- * up with the following deadlock:
- *
- * - This thread is waiting for the txg to sync while
- * holding the waiter's lock; txg_wait_synced() is
- * used within txg_commit_writer_stall().
- *
- * - The txg can't sync because it is waiting for this
- * lwb's zio callback to call dmu_tx_commit().
- *
- * - The lwb's zio callback can't call dmu_tx_commit()
- * because it's blocked trying to acquire the waiter's
- * lock, which occurs prior to calling dmu_tx_commit()
*/
- mutex_exit(&zcw->zcw_lock);
+ zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
- mutex_enter(&zcw->zcw_lock);
+ mutex_exit(&zilog->zl_issuer_lock);
+ } else {
+ mutex_exit(&zilog->zl_issuer_lock);
+ zil_lwb_write_issue(zilog, lwb);
}
-
-out:
- mutex_exit(&zilog->zl_issuer_lock);
- ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+ mutex_enter(&zcw->zcw_lock);
}
/*
@@ -2667,7 +3268,7 @@ out:
* waited "long enough" and the lwb is still in the "open" state.
*
* Given a sufficient amount of itxs being generated and written using
- * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
* function. If this does not occur, this secondary responsibility will
* ensure the lwb is issued even if there is not other synchronous
* activity on the system.
@@ -2718,7 +3319,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
* where it's "zcw_lwb" field is NULL, and it hasn't yet
* been skipped, so it's "zcw_done" field is still B_FALSE.
*/
- IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
+ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW);
if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
ASSERT3B(timedout, ==, B_FALSE);
@@ -2766,6 +3367,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
*/
IMPLY(lwb != NULL,
+ lwb->lwb_state == LWB_STATE_CLOSED ||
+ lwb->lwb_state == LWB_STATE_READY ||
lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_WRITE_DONE ||
lwb->lwb_state == LWB_STATE_FLUSH_DONE);
@@ -2812,7 +3415,14 @@ static void
zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ /*
+ * Since we are not going to create any new dirty data, and we
+ * can even help with clearing the existing dirty data, we
+ * should not be subject to the dirty data based delays. We
+ * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ */
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
itx->itx_sync = B_TRUE;
@@ -2869,7 +3479,7 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
* queue prior to zil_commit() having been called, and which itxs were
* added after zil_commit() was called.
*
- * The commit it is special; it doesn't have any on-disk representation.
+ * The commit itx is special; it doesn't have any on-disk representation.
* When a commit itx is "committed" to an lwb, the waiter associated
* with it is linked onto the lwb's list of waiters. Then, when that lwb
* completes, each waiter on the lwb's list is marked done and signaled
@@ -2884,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
* callback of the lwb's zio[*].
*
* * Actually, the waiters are signaled in the zio completion
- * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
- * that are sent to the vdevs upon completion of the lwb zio.
+ * callback of the root zio for the flush commands that are sent to
+ * the vdevs upon completion of the lwb zio.
*
* 2. When the itxs are inserted into the ZIL's queue of uncommitted
* itxs, the order in which they are inserted is preserved[*]; as
@@ -2992,7 +3602,7 @@ zil_commit(zilog_t *zilog, uint64_t foid)
void
zil_commit_impl(zilog_t *zilog, uint64_t foid)
{
- ZIL_STAT_BUMP(zil_commit_count);
+ ZIL_STAT_BUMP(zilog, zil_commit_count);
/*
* Move the "async" itxs for the specified foid to the "sync"
@@ -3023,7 +3633,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
zil_commit_itx_assign(zilog, zcw);
- zil_commit_writer(zilog, zcw);
+ uint64_t wtxg = zil_commit_writer(zilog, zcw);
zil_commit_waiter(zilog, zcw);
if (zcw->zcw_zio_error != 0) {
@@ -3038,6 +3648,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
DTRACE_PROBE2(zil__commit__io__error,
zilog_t *, zilog, zil_commit_waiter_t *, zcw);
txg_wait_synced(zilog->zl_dmu_pool, 0);
+ } else if (wtxg != 0) {
+ txg_wait_synced(zilog->zl_dmu_pool, wtxg);
}
zil_free_commit_waiter(zcw);
@@ -3062,6 +3674,8 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
if (spa_sync_pass(spa) != 1)
return;
+ zil_lwb_flush_wait_all(zilog, txg);
+
mutex_enter(&zilog->zl_lock);
ASSERT(zilog->zl_stop_sync == 0);
@@ -3074,11 +3688,13 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
if (zilog->zl_destroy_txg == txg) {
blkptr_t blk = zh->zh_log;
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
- ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
- bzero(zh, sizeof (zil_header_t));
- bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
+ memset(zh, 0, sizeof (zil_header_t));
+ memset(zilog->zl_replayed_seq, 0,
+ sizeof (zilog->zl_replayed_seq));
if (zilog->zl_keep_first) {
/*
@@ -3091,15 +3707,27 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
*/
zil_init_log_chain(zilog, &blk);
zh->zh_log = blk;
+ } else {
+ /*
+ * A destroyed ZIL chain can't contain any TX_SETSAXATTR
+ * records. So, deactivate the feature for this dataset.
+ * We activate it again when we start a new ZIL chain.
+ */
+ if (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_ZILSAXATTR))
+ dsl_dataset_deactivate_feature(ds,
+ SPA_FEATURE_ZILSAXATTR, tx);
}
}
while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
zh->zh_log = lwb->lwb_blk;
- if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+ if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
+ lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg)
break;
list_remove(&zilog->zl_lwb_list, lwb);
- zio_free(spa, txg, &lwb->lwb_blk);
+ if (!BP_IS_HOLE(&lwb->lwb_blk))
+ zio_free(spa, txg, &lwb->lwb_blk);
zil_free_lwb(zilog, lwb);
/*
@@ -3108,29 +3736,17 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
* out the zil_header blkptr so that we don't end
* up freeing the same block twice.
*/
- if (list_head(&zilog->zl_lwb_list) == NULL)
+ if (list_is_empty(&zilog->zl_lwb_list))
BP_ZERO(&zh->zh_log);
}
- /*
- * Remove fastwrite on any blocks that have been pre-allocated for
- * the next commit. This prevents fastwrite counter pollution by
- * unused, long-lived LWBs.
- */
- for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) {
- if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) {
- metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
- lwb->lwb_fastwrite = 0;
- }
- }
-
mutex_exit(&zilog->zl_lock);
}
-/* ARGSUSED */
static int
zil_lwb_cons(void *vbuf, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
lwb_t *lwb = vbuf;
list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
@@ -3141,10 +3757,10 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag)
return (0);
}
-/* ARGSUSED */
static void
zil_lwb_dest(void *vbuf, void *unused)
{
+ (void) unused;
lwb_t *lwb = vbuf;
mutex_destroy(&lwb->lwb_vdev_lock);
avl_destroy(&lwb->lwb_vdev_tree);
@@ -3161,13 +3777,16 @@ zil_init(void)
zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
- zil_ksp = kstat_create("zfs", 0, "zil", "misc",
+ zil_sums_init(&zil_sums_global);
+ zil_kstats_global = kstat_create("zfs", 0, "zil", "misc",
KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
- if (zil_ksp != NULL) {
- zil_ksp->ks_data = &zil_stats;
- kstat_install(zil_ksp);
+ if (zil_kstats_global != NULL) {
+ zil_kstats_global->ks_data = &zil_stats;
+ zil_kstats_global->ks_update = zil_kstats_global_update;
+ zil_kstats_global->ks_private = NULL;
+ kstat_install(zil_kstats_global);
}
}
@@ -3177,10 +3796,12 @@ zil_fini(void)
kmem_cache_destroy(zil_zcw_cache);
kmem_cache_destroy(zil_lwb_cache);
- if (zil_ksp != NULL) {
- kstat_delete(zil_ksp);
- zil_ksp = NULL;
+ if (zil_kstats_global != NULL) {
+ kstat_delete(zil_kstats_global);
+ zil_kstats_global = NULL;
}
+
+ zil_sums_fini(&zil_sums_global);
}
void
@@ -3212,10 +3833,13 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
zilog->zl_dirty_max_txg = 0;
zilog->zl_last_lwb_opened = NULL;
zilog->zl_last_lwb_latency = 0;
- zilog->zl_max_block_size = zil_maxblocksize;
+ zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize,
+ ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ),
+ spa_maxblocksize(dmu_objset_spa(os)));
mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL);
for (int i = 0; i < TXG_SIZE; i++) {
mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
@@ -3229,6 +3853,12 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
offsetof(itx_t, itx_node));
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+ cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
+
+ for (int i = 0; i < ZIL_BURSTS; i++) {
+ zilog->zl_prev_opt[i] = zilog->zl_max_block_size -
+ sizeof (zil_chain_t);
+ }
return (zilog);
}
@@ -3264,8 +3894,10 @@ zil_free(zilog_t *zilog)
mutex_destroy(&zilog->zl_issuer_lock);
mutex_destroy(&zilog->zl_lock);
+ mutex_destroy(&zilog->zl_lwb_io_lock);
cv_destroy(&zilog->zl_cv_suspend);
+ cv_destroy(&zilog->zl_lwb_io_cv);
kmem_free(zilog, sizeof (zilog_t));
}
@@ -3274,7 +3906,7 @@ zil_free(zilog_t *zilog)
* Open an intent log.
*/
zilog_t *
-zil_open(objset_t *os, zil_get_data_t *get_data)
+zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums)
{
zilog_t *zilog = dmu_objset_zil(os);
@@ -3283,6 +3915,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
ASSERT(list_is_empty(&zilog->zl_lwb_list));
zilog->zl_get_data = get_data;
+ zilog->zl_sums = zil_sums;
return (zilog);
}
@@ -3299,23 +3932,33 @@ zil_close(zilog_t *zilog)
if (!dmu_objset_is_snapshot(zilog->zl_os)) {
zil_commit(zilog, 0);
} else {
- ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
ASSERT0(zilog->zl_dirty_max_txg);
ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
}
mutex_enter(&zilog->zl_lock);
+ txg = zilog->zl_dirty_max_txg;
lwb = list_tail(&zilog->zl_lwb_list);
- if (lwb == NULL)
- txg = zilog->zl_dirty_max_txg;
- else
- txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
+ if (lwb != NULL) {
+ txg = MAX(txg, lwb->lwb_alloc_txg);
+ txg = MAX(txg, lwb->lwb_max_txg);
+ }
mutex_exit(&zilog->zl_lock);
/*
- * We need to use txg_wait_synced() to wait long enough for the
- * ZIL to be clean, and to wait for all pending lwbs to be
- * written out.
+ * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
+ * on the time when the dmu_tx transaction is assigned in
+ * zil_lwb_write_issue().
+ */
+ mutex_enter(&zilog->zl_lwb_io_lock);
+ txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
+ mutex_exit(&zilog->zl_lwb_io_lock);
+
+ /*
+ * We need to use txg_wait_synced() to wait until that txg is synced.
+ * zil_sync() will guarantee all lwbs up to that txg have been
+ * written out, flushed, and cleaned.
*/
if (txg != 0)
txg_wait_synced(zilog->zl_dmu_pool, txg);
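
zil_close() now waits on the largest of several txgs: the dirty-max txg, the last lwb's allocation and max txgs, and zl_lwb_max_issued_txg, which can run ahead of lwb_max_txg because the dmu_tx is only assigned in zil_lwb_write_issue(). A small sketch of that selection, with illustrative parameter names:

    static uint64_t
    zil_close_wait_txg(uint64_t dirty_max, uint64_t lwb_alloc_txg,
        uint64_t lwb_max_txg, uint64_t lwb_max_issued_txg)
    {
            uint64_t txg = dirty_max;
            if (lwb_alloc_txg > txg)
                    txg = lwb_alloc_txg;
            if (lwb_max_txg > txg)
                    txg = lwb_max_txg;
            if (lwb_max_issued_txg > txg)
                    txg = lwb_max_issued_txg;
            return (txg);           /* 0 means there is nothing to wait for */
    }
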
@@ -3332,22 +3975,17 @@ zil_close(zilog_t *zilog)
* We should have only one lwb left on the list; remove it now.
*/
mutex_enter(&zilog->zl_lock);
- lwb = list_head(&zilog->zl_lwb_list);
+ lwb = list_remove_head(&zilog->zl_lwb_list);
if (lwb != NULL) {
- ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-
- if (lwb->lwb_fastwrite)
- metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
-
- list_remove(&zilog->zl_lwb_list, lwb);
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
zil_free_lwb(zilog, lwb);
}
mutex_exit(&zilog->zl_lock);
}
-static char *suspend_tag = "zil suspending";
+static const char *suspend_tag = "zil suspending";
/*
* Suspend an intent log. While in suspended mode, we still honor
@@ -3461,7 +4099,7 @@ zil_suspend(const char *osname, void **cookiep)
/*
* We need to use zil_commit_impl to ensure we wait for all
- * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed
+ * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed
* to disk before proceeding. If we used zil_commit instead, it
* would just call txg_wait_synced(), because zl_suspend is set.
* txg_wait_synced() doesn't wait for these lwb's to be
@@ -3508,7 +4146,7 @@ zil_resume(void *cookie)
}
typedef struct zil_replay_arg {
- zil_replay_func_t **zr_replay;
+ zil_replay_func_t *const *zr_replay;
void *zr_arg;
boolean_t zr_byteswap;
char *zr_lr;
@@ -3570,7 +4208,7 @@ zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
/*
* Make a copy of the data so we can revise and extend it.
*/
- bcopy(lr, zr->zr_lr, reclen);
+ memcpy(zr->zr_lr, lr, reclen);
/*
* If this is a TX_WRITE with a blkptr, suck in the data.
@@ -3615,10 +4253,11 @@ zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
return (0);
}
-/* ARGSUSED */
static int
zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
{
+ (void) bp, (void) arg, (void) claim_txg;
+
zilog->zl_replay_blks++;
return (0);
@@ -3626,17 +4265,18 @@ zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
/*
* If this dataset has a non-empty intent log, replay it and destroy it.
+ * Return B_TRUE if there were any entries to replay.
*/
-void
-zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
+boolean_t
+zil_replay(objset_t *os, void *arg,
+ zil_replay_func_t *const replay_func[TX_MAX_TYPE])
{
zilog_t *zilog = dmu_objset_zil(os);
const zil_header_t *zh = zilog->zl_header;
zil_replay_arg_t zr;
if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
- zil_destroy(zilog, B_TRUE);
- return;
+ return (zil_destroy(zilog, B_TRUE));
}
zr.zr_replay = replay_func;
@@ -3659,6 +4299,8 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
zilog->zl_replay = B_FALSE;
+
+ return (B_TRUE);
}
boolean_t
@@ -3677,13 +4319,12 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
return (B_FALSE);
}
-/* ARGSUSED */
int
zil_reset(const char *osname, void *arg)
{
- int error;
+ (void) arg;
- error = zil_suspend(osname, NULL);
+ int error = zil_suspend(osname, NULL);
/* EACCES means crypto key not loaded */
if ((error == EACCES) || (error == EBUSY))
return (SET_ERROR(error));
@@ -3714,9 +4355,11 @@ EXPORT_SYMBOL(zil_lwb_add_block);
EXPORT_SYMBOL(zil_bp_tree_add);
EXPORT_SYMBOL(zil_set_sync);
EXPORT_SYMBOL(zil_set_logbias);
+EXPORT_SYMBOL(zil_sums_init);
+EXPORT_SYMBOL(zil_sums_fini);
+EXPORT_SYMBOL(zil_kstat_values_update);
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
"ZIL block open timeout percentage");
ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
@@ -3725,9 +4368,11 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
"Disable ZIL cache flushes");
-ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
"Limit in bytes slog sync writes per commit");
-ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
"Limit in bytes of ZIL log block size");
-/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
+ "Limit in bytes WR_COPIED size");
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index c016fa323b41..d68d5ababe79 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,10 +20,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, 2023, 2024, Klara Inc.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2021, Datto, Inc.
*/
@@ -41,6 +41,7 @@
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
+#include <sys/brt.h>
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
@@ -57,33 +58,33 @@
* I/O type descriptions
* ==========================================================================
*/
-const char *zio_type_name[ZIO_TYPES] = {
+const char *const zio_type_name[ZIO_TYPES] = {
/*
* Note: Linux kernel thread name length is limited
* so these names will differ from upstream open zfs.
*/
- "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
+ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim"
};
int zio_dva_throttle_enabled = B_TRUE;
-int zio_deadman_log_all = B_FALSE;
+static int zio_deadman_log_all = B_FALSE;
/*
* ==========================================================================
* I/O kmem caches
* ==========================================================================
*/
-kmem_cache_t *zio_cache;
-kmem_cache_t *zio_link_cache;
+static kmem_cache_t *zio_cache;
+static kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
-uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
-uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#endif
/* Mark IOs as "slow" if they take longer than 30 seconds */
-int zio_slow_io_ms = (30 * MILLISEC);
+static uint_t zio_slow_io_ms = (30 * MILLISEC);
#define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
@@ -114,9 +115,15 @@ int zio_slow_io_ms = (30 * MILLISEC);
* fragmented systems, which may have very few free segments of this size,
* and may need to load new metaslabs to satisfy 128K allocations.
*/
-int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
-int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */
-int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+
+/* defer frees starting in this pass */
+uint_t zfs_sync_pass_deferred_free = 2;
+
+/* don't compress starting in this pass */
+static uint_t zfs_sync_pass_dont_compress = 8;
+
+/* rewrite new bps starting in this pass */
+static uint_t zfs_sync_pass_rewrite = 2;
/*
* An allocating zio is one that either currently has the DVA allocate
@@ -129,12 +136,12 @@ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
* allocations as well.
*/
int zio_exclude_metadata = 0;
-int zio_requeue_io_start_cut_in_line = 1;
+static int zio_requeue_io_start_cut_in_line = 1;
#ifdef ZFS_DEBUG
-int zio_buf_debug_limit = 16384;
+static const int zio_buf_debug_limit = 16384;
#else
-int zio_buf_debug_limit = 0;
+static const int zio_buf_debug_limit = 0;
#endif
static inline void __zio_execute(zio_t *zio);
@@ -151,32 +158,22 @@ zio_init(void)
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
- /*
- * For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
- * for each quarter-power of 2.
- */
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
- size_t p2 = size;
- size_t align = 0;
- size_t data_cflags, cflags;
-
- data_cflags = KMC_NODEBUG;
- cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
- KMC_NODEBUG : 0;
+ size_t align, cflags, data_cflags;
+ char name[32];
-#if defined(_ILP32) && defined(_KERNEL)
/*
- * Cache size limited to 1M on 32-bit platforms until ARC
- * buffers no longer require virtual address space.
+ * Create cache for each half-power of 2 size, starting from
+ * SPA_MINBLOCKSIZE. It should give us memory space efficiency
+ * of ~7/8, sufficient for transient allocations mostly using
+ * these caches.
*/
- if (size > zfs_max_recordsize)
- break;
-#endif
-
+ size_t p2 = size;
while (!ISP2(p2))
p2 &= p2 - 1;
+ if (!IS_P2ALIGNED(size, p2 / 2))
+ continue;
#ifndef _KERNEL
/*
@@ -187,47 +184,37 @@ zio_init(void)
*/
if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
continue;
- /*
- * Here's the problem - on 4K native devices in userland on
- * Linux using O_DIRECT, buffers must be 4K aligned or I/O
- * will fail with EINVAL, causing zdb (and others) to coredump.
- * Since userland probably doesn't need optimized buffer caches,
- * we just force 4K alignment on everything.
- */
- align = 8 * SPA_MINBLOCKSIZE;
-#else
- if (size < PAGESIZE) {
- align = SPA_MINBLOCKSIZE;
- } else if (IS_P2ALIGNED(size, p2 >> 2)) {
- align = PAGESIZE;
- }
#endif
- if (align != 0) {
- char name[36];
- if (cflags == data_cflags) {
- /*
- * Resulting kmem caches would be identical.
- * Save memory by creating only one.
- */
- (void) snprintf(name, sizeof (name),
- "zio_buf_comb_%lu", (ulong_t)size);
- zio_buf_cache[c] = kmem_cache_create(name,
- size, align, NULL, NULL, NULL, NULL, NULL,
- cflags);
- zio_data_buf_cache[c] = zio_buf_cache[c];
- continue;
- }
- (void) snprintf(name, sizeof (name), "zio_buf_%lu",
- (ulong_t)size);
- zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, cflags);
-
- (void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
- (ulong_t)size);
- zio_data_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, data_cflags);
+ if (IS_P2ALIGNED(size, PAGESIZE))
+ align = PAGESIZE;
+ else
+ align = 1 << (highbit64(size ^ (size - 1)) - 1);
+
+ cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
+ KMC_NODEBUG : 0;
+ data_cflags = KMC_NODEBUG;
+ if (cflags == data_cflags) {
+ /*
+ * Resulting kmem caches would be identical.
+ * Save memory by creating only one.
+ */
+ (void) snprintf(name, sizeof (name),
+ "zio_buf_comb_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size, align,
+ NULL, NULL, NULL, NULL, NULL, cflags);
+ zio_data_buf_cache[c] = zio_buf_cache[c];
+ continue;
}
+ (void) snprintf(name, sizeof (name), "zio_buf_%lu",
+ (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size, align,
+ NULL, NULL, NULL, NULL, NULL, cflags);
+
+ (void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+ (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
+ NULL, NULL, NULL, NULL, NULL, data_cflags);
}
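
The rewritten loop above creates a kmem cache only for sizes that are a multiple of half of their rounded-down power of two ("half-powers of two"), which keeps worst-case internal fragmentation near 1/8, and it aligns page-multiple sizes to PAGESIZE while other sizes get the largest power of two that divides them. A standalone sketch that prints which sizes up to 64 KiB would receive a cache under that rule (constants are stand-ins):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MINBLOCK 512u          /* stand-in for SPA_MINBLOCKSIZE */

    static bool
    gets_cache(uint32_t size)
    {
            uint32_t p2 = size;
            while (p2 & (p2 - 1))
                    p2 &= p2 - 1;                  /* round down to a power of two */
            return (size % (p2 / 2) == 0);         /* keep multiples of half of it */
    }

    int
    main(void)
    {
            for (uint32_t size = MINBLOCK; size <= 65536; size += MINBLOCK)
                    if (gets_cache(size))
                            printf("%u\n", size);  /* 512, 1024, 1536, 2048, 3072, ... */
            return (0);
    }
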
while (--c != 0) {
@@ -308,6 +295,53 @@ zio_fini(void)
* ==========================================================================
*/
+#ifdef ZFS_DEBUG
+static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
+#endif
+
+/*
+ * Use empty space after the buffer to detect overflows.
+ *
+ * Since zio_init() creates kmem caches only for certain set of buffer sizes,
+ * allocations of different sizes may have some unused space after the data.
+ * Filling part of that space with a known pattern on allocation and checking
+ * it on free should allow us to detect some buffer overflows.
+ */
+static void
+zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
+{
+#ifdef ZFS_DEBUG
+ size_t off = P2ROUNDUP(size, sizeof (ulong_t));
+ ulong_t *canary = p + off / sizeof (ulong_t);
+ size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
+ if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
+ cache[c] == cache[c + 1])
+ asize = (c + 2) << SPA_MINBLOCKSHIFT;
+ for (; off < asize; canary++, off += sizeof (ulong_t))
+ *canary = zio_buf_canary;
+#endif
+}
+
+static void
+zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
+{
+#ifdef ZFS_DEBUG
+ size_t off = P2ROUNDUP(size, sizeof (ulong_t));
+ ulong_t *canary = p + off / sizeof (ulong_t);
+ size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
+ if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
+ cache[c] == cache[c + 1])
+ asize = (c + 2) << SPA_MINBLOCKSHIFT;
+ for (; off < asize; canary++, off += sizeof (ulong_t)) {
+ if (unlikely(*canary != zio_buf_canary)) {
+ PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx",
+ p, size, (canary - p) * sizeof (ulong_t),
+ *canary, zio_buf_canary);
+ }
+ }
+#endif
+}
+
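
The canary helpers above exploit the slack between a requested size and the size of the kmem cache actually backing it: the slack is filled with a fixed 64-bit pattern on allocation and re-checked on free, so a write past the logical end of the buffer trips a panic instead of silently corrupting a neighbour. A toy user-space model of the same idea (the real code also has to account for merged "zio_buf_comb" caches that span two size classes):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define CANARY 0xdeadc0dedead210bULL

    static void
    fill_canary(uint64_t *buf, size_t used, size_t cache_size)
    {
            for (size_t off = used; off + 8 <= cache_size; off += 8)
                    buf[off / 8] = CANARY;          /* pattern the unused tail */
    }

    static void
    check_canary(const uint64_t *buf, size_t used, size_t cache_size)
    {
            for (size_t off = used; off + 8 <= cache_size; off += 8)
                    assert(buf[off / 8] == CANARY); /* overflow tripped the canary */
    }

    int
    main(void)
    {
            size_t cache_size = 1024, used = 600;   /* 600-byte request, 1 KiB cache */
            uint64_t *buf = calloc(1, cache_size);
            fill_canary(buf, used, cache_size);
            /* ... caller uses the first 600 bytes ... */
            check_canary(buf, used, cache_size);
            free(buf);
            return (0);
    }
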
/*
* Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
* crashdump if the kernel panics, so use it judiciously. Obviously, it's
@@ -324,7 +358,9 @@ zio_buf_alloc(size_t size)
atomic_add_64(&zio_buf_cache_allocs[c], 1);
#endif
- return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+ void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
+ zio_buf_put_canary(p, size, zio_buf_cache, c);
+ return (p);
}
/*
@@ -340,7 +376,9 @@ zio_data_buf_alloc(size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
- return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+ void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
+ zio_buf_put_canary(p, size, zio_data_buf_cache, c);
+ return (p);
}
void
@@ -353,6 +391,7 @@ zio_buf_free(void *buf, size_t size)
atomic_add_64(&zio_buf_cache_frees[c], 1);
#endif
+ zio_buf_check_canary(buf, size, zio_buf_cache, c);
kmem_cache_free(zio_buf_cache[c], buf);
}
@@ -363,12 +402,14 @@ zio_data_buf_free(void *buf, size_t size)
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+ zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
kmem_cache_free(zio_data_buf_cache[c], buf);
}
static void
zio_abd_free(void *abd, size_t size)
{
+ (void) size;
abd_free((abd_t *)abd);
}
@@ -514,8 +555,9 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
/*
* If this is an authenticated block, just check the MAC. It would be
- * nice to separate this out into its own flag, but for the moment
- * enum zio_flag is out of bits.
+ * nice to separate this out into its own flag, but when this was done,
+ * we had run out of bits in what is now zio_flag_t. Future cleanup
+ * could make this a flag bit.
*/
if (BP_IS_AUTHENTICATED(bp)) {
if (ot == DMU_OT_OBJSET) {
@@ -570,7 +612,8 @@ error:
if (ret == ECKSUM) {
zio->io_error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
- spa_log_error(spa, &zio->io_bookmark);
+ spa_log_error(spa, &zio->io_bookmark,
+ BP_GET_LOGICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, &zio->io_bookmark, zio, 0);
}
@@ -625,8 +668,6 @@ zio_unique_parent(zio_t *cio)
void
zio_add_child(zio_t *pio, zio_t *cio)
{
- zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
-
/*
* Logical I/Os can have logical, gang, or vdev children.
* Gang I/Os can have gang or vdev children.
@@ -635,6 +676,12 @@ zio_add_child(zio_t *pio, zio_t *cio)
*/
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
+ /* Parent should not have READY stage if child doesn't have it. */
+ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
+ (cio->io_child_type != ZIO_CHILD_VDEV),
+ (pio->io_pipeline & ZIO_STAGE_READY) == 0);
+
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
zl->zl_parent = pio;
zl->zl_child = cio;
@@ -643,19 +690,53 @@ zio_add_child(zio_t *pio, zio_t *cio)
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+ uint64_t *countp = pio->io_children[cio->io_child_type];
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
+ countp[w] += !cio->io_state[w];
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
- pio->io_child_count++;
- cio->io_parent_count++;
-
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
}
+void
+zio_add_child_first(zio_t *pio, zio_t *cio)
+{
+ /*
+ * Logical I/Os can have logical, gang, or vdev children.
+ * Gang I/Os can have gang or vdev children.
+ * Vdev I/Os can only have vdev children.
+ * The following ASSERT captures all of these constraints.
+ */
+ ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
+
+ /* Parent should not have READY stage if child doesn't have it. */
+ IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
+ (cio->io_child_type != ZIO_CHILD_VDEV),
+ (pio->io_pipeline & ZIO_STAGE_READY) == 0);
+
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+ zl->zl_parent = pio;
+ zl->zl_child = cio;
+
+ ASSERT(list_is_empty(&cio->io_parent_list));
+ list_insert_head(&cio->io_parent_list, zl);
+
+ mutex_enter(&pio->io_lock);
+
+ ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+
+ uint64_t *countp = pio->io_children[cio->io_child_type];
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ countp[w] += !cio->io_state[w];
+
+ list_insert_head(&pio->io_child_list, zl);
+
+ mutex_exit(&pio->io_lock);
+}
+
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
@@ -668,9 +749,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
- pio->io_child_count--;
- cio->io_parent_count--;
-
mutex_exit(&cio->io_lock);
mutex_exit(&pio->io_lock);
kmem_cache_free(zio_link_cache, zl);
@@ -725,7 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
/*
* If we can tell the caller to execute this parent next, do
- * so. Otherwise dispatch the parent zio as its own task.
+ * so. We do this if the parent's zio type matches the child's
+ * type, or if it's a zio_null() with no done callback, and so
+ * has no actual work to do. Otherwise dispatch the parent zio
+ * in its own taskq.
*
* Having the caller execute the parent when possible reduces
* locking on the zio taskq's, reduces context switch
@@ -744,7 +825,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
* parent-child relationships, as we do with the "mega zio"
* of writes for spa_sync(), and the chain of ZIL blocks.
*/
- if (next_to_executep != NULL && *next_to_executep == NULL) {
+ if (next_to_executep != NULL && *next_to_executep == NULL &&
+ (pio->io_type == zio->io_type ||
+ (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) {
*next_to_executep = pio;
} else {
zio_taskq_dispatch(pio, type, B_FALSE);
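
The condition added above narrows when a finished child hands its parent back to the caller for inline execution: only when the parent is the same I/O type, or a zio_null() with no done callback and therefore no real work. Everything else is dispatched to the parent's own taskq so, for example, a write parent never runs on a read completion thread. A small sketch of the predicate, with illustrative stand-ins for zio_type_t:

    #include <stdbool.h>

    typedef enum { T_NULL, T_READ, T_WRITE, T_FREE, T_CLAIM, T_FLUSH, T_TRIM } io_type_t;

    static bool
    execute_parent_inline(bool caller_can_take_next, io_type_t parent_type,
        bool parent_has_done_cb, io_type_t child_type)
    {
            if (!caller_can_take_next)
                    return (false);                 /* caller did not offer a slot */
            return (parent_type == child_type ||
                (parent_type == T_NULL && !parent_has_done_cb));
    }
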
@@ -804,7 +887,7 @@ static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
void *private, zio_type_t type, zio_priority_t priority,
- enum zio_flag flags, vdev_t *vd, uint64_t offset,
+ zio_flag_t flags, vdev_t *vd, uint64_t offset,
const zbookmark_phys_t *zb, enum zio_stage stage,
enum zio_stage pipeline)
{
@@ -821,7 +904,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
- bzero(zio, sizeof (zio_t));
+ memset(zio, 0, sizeof (zio_t));
mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
@@ -842,12 +925,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
- zio->io_bp = (blkptr_t *)bp;
- zio->io_bp_copy = *bp;
- zio->io_bp_orig = *bp;
if (type != ZIO_TYPE_WRITE ||
- zio->io_child_type == ZIO_CHILD_DDT)
+ zio->io_child_type == ZIO_CHILD_DDT) {
+ zio->io_bp_copy = *bp;
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
+ } else {
+ zio->io_bp = (blkptr_t *)bp;
+ }
+ zio->io_bp_orig = *bp;
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
@@ -869,8 +954,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+ zio->io_allocator = ZIO_ALLOCATOR_NONE;
- zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+ zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
+ (pipeline & ZIO_STAGE_READY) == 0;
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
if (zb != NULL)
@@ -882,7 +969,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
- zio_add_child(pio, zio);
+ zio_add_child_first(pio, zio);
}
taskq_init_ent(&zio->io_tqent);
@@ -890,7 +977,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
return (zio);
}
-static void
+void
zio_destroy(zio_t *zio)
{
metaslab_trace_fini(&zio->io_alloc_list);
@@ -901,9 +988,13 @@ zio_destroy(zio_t *zio)
kmem_cache_free(zio_cache, zio);
}
+/*
+ * ZIO intended to be between others. Provides synchronization at READY
+ * and DONE pipeline stages and calls the respective callbacks.
+ */
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
- void *private, enum zio_flag flags)
+ void *private, zio_flag_t flags)
{
zio_t *zio;
@@ -914,10 +1005,22 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
return (zio);
}
+/*
+ * ZIO intended to be a root of a tree. Unlike null ZIO does not have a
+ * READY pipeline stage (is ready on creation), so it should not be used
+ * as child of any ZIO that may need waiting for grandchildren READY stage
+ * (any other ZIO type).
+ */
zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags)
{
- return (zio_null(NULL, spa, NULL, done, private, flags));
+ zio_t *zio;
+
+ zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE);
+
+ return (zio);
}
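
zio_root() is no longer a thin wrapper around zio_null(): it is created with ZIO_ROOT_PIPELINE, which drops the READY stage, so a root zio is "ready" from birth and, per the comment, must not be used as a child of any zio that waits on grandchildren READY. The usual pattern of hanging children off a root and waiting on it is unchanged; a hedged sketch using only functions visible in this diff, with spa and vd assumed in scope and error handling elided:

    zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
    zio_flush(rio, vd);             /* flushes are issued to vd's leaves immediately */
    int error = zio_wait(rio);      /* blocks until every child of the root is done */
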
static int
@@ -931,9 +1034,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
(void) vsnprintf(buf, sizeof (buf), fmt, adx);
va_end(adx);
+ zfs_dbgmsg("bad blkptr at %px: "
+ "DVA[0]=%#llx/%#llx "
+ "DVA[1]=%#llx/%#llx "
+ "DVA[2]=%#llx/%#llx "
+ "prop=%#llx "
+ "pad=%#llx,%#llx "
+ "phys_birth=%#llx "
+ "birth=%#llx "
+ "fill=%#llx "
+ "cksum=%#llx/%#llx/%#llx/%#llx",
+ bp,
+ (long long)bp->blk_dva[0].dva_word[0],
+ (long long)bp->blk_dva[0].dva_word[1],
+ (long long)bp->blk_dva[1].dva_word[0],
+ (long long)bp->blk_dva[1].dva_word[1],
+ (long long)bp->blk_dva[2].dva_word[0],
+ (long long)bp->blk_dva[2].dva_word[1],
+ (long long)bp->blk_prop,
+ (long long)bp->blk_pad[0],
+ (long long)bp->blk_pad[1],
+ (long long)BP_GET_PHYSICAL_BIRTH(bp),
+ (long long)BP_GET_LOGICAL_BIRTH(bp),
+ (long long)bp->blk_fill,
+ (long long)bp->blk_cksum.zc_word[0],
+ (long long)bp->blk_cksum.zc_word[1],
+ (long long)bp->blk_cksum.zc_word[2],
+ (long long)bp->blk_cksum.zc_word[3]);
switch (blk_verify) {
case BLK_VERIFY_HALT:
- dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
zfs_panic_recover("%s: %s", spa_name(spa), buf);
break;
case BLK_VERIFY_LOG:
@@ -954,49 +1083,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
* If everything checks out B_TRUE is returned. The zfs_blkptr_verify
* argument controls the behavior when an invalid field is detected.
*
- * Modes for zfs_blkptr_verify:
- * 1) BLK_VERIFY_ONLY (evaluate the block)
- * 2) BLK_VERIFY_LOG (evaluate the block and log problems)
- * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error)
+ * Values for blk_verify_flag:
+ * BLK_VERIFY_ONLY: evaluate the block
+ * BLK_VERIFY_LOG: evaluate the block and log problems
+ * BLK_VERIFY_HALT: call zfs_panic_recover on error
+ *
+ * Values for blk_config_flag:
+ * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
+ * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
+ * obtained for reader
+ * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
+ * performance
*/
boolean_t
-zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
- enum blk_verify_flag blk_verify)
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
+ enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
{
int errors = 0;
if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p has invalid TYPE %llu",
+ "blkptr at %px has invalid TYPE %llu",
bp, (longlong_t)BP_GET_TYPE(bp));
}
- if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
- BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+ if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p has invalid CHECKSUM %llu",
+ "blkptr at %px has invalid CHECKSUM %llu",
bp, (longlong_t)BP_GET_CHECKSUM(bp));
}
- if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
- BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+ if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p has invalid COMPRESS %llu",
+ "blkptr at %px has invalid COMPRESS %llu",
bp, (longlong_t)BP_GET_COMPRESS(bp));
}
if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p has invalid LSIZE %llu",
+ "blkptr at %px has invalid LSIZE %llu",
bp, (longlong_t)BP_GET_LSIZE(bp));
}
if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p has invalid PSIZE %llu",
+ "blkptr at %px has invalid PSIZE %llu",
bp, (longlong_t)BP_GET_PSIZE(bp));
}
if (BP_IS_EMBEDDED(bp)) {
if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p has invalid ETYPE %llu",
+ "blkptr at %px has invalid ETYPE %llu",
bp, (longlong_t)BPE_GET_ETYPE(bp));
}
}
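
The boolean config_held parameter is replaced by a three-way blk_config_flag, so callers state exactly what they hold: BLK_CONFIG_HELD when SCL_VDEV is already held as writer, BLK_CONFIG_NEEDED to have the function take it as reader, and BLK_CONFIG_SKIP to run only the structural checks above and skip anything that needs the config lock. A caller-side sketch; both calls use values shown in this diff:

    /* hot path: structural checks only, never touches SCL_VDEV */
    (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, BLK_VERIFY_LOG);

    /* slow path: full check, takes SCL_VDEV as reader, recoverable panic on error */
    (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
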
@@ -1008,17 +1142,27 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
if (!spa->spa_trust_config)
return (errors == 0);
- if (!config_held)
- spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
- else
+ switch (blk_config) {
+ case BLK_CONFIG_HELD:
ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
+ break;
+ case BLK_CONFIG_NEEDED:
+ spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
+ break;
+ case BLK_CONFIG_SKIP:
+ return (errors == 0);
+ default:
+ panic("invalid blk_config %u", blk_config);
+ }
+
/*
* Pool-specific checks.
*
- * Note: it would be nice to verify that the blk_birth and
- * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
- * allows the birth time of log blocks (and dmu_sync()-ed blocks
- * that are in the log) to be arbitrarily large.
+ * Note: it would be nice to verify that the logical birth
+ * and physical birth are not too large. However,
+ * spa_freeze() allows the birth time of log blocks (and
+ * dmu_sync()-ed blocks that are in the log) to be arbitrarily
+ * large.
*/
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
const dva_t *dva = &bp->blk_dva[i];
@@ -1026,20 +1170,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
if (vdevid >= spa->spa_root_vdev->vdev_children) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p DVA %u has invalid VDEV %llu",
+ "blkptr at %px DVA %u has invalid VDEV %llu",
bp, i, (longlong_t)vdevid);
continue;
}
vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
if (vd == NULL) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p DVA %u has invalid VDEV %llu",
+ "blkptr at %px DVA %u has invalid VDEV %llu",
bp, i, (longlong_t)vdevid);
continue;
}
if (vd->vdev_ops == &vdev_hole_ops) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p DVA %u has hole VDEV %llu",
+ "blkptr at %px DVA %u has hole VDEV %llu",
bp, i, (longlong_t)vdevid);
continue;
}
@@ -1057,13 +1201,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
asize = vdev_gang_header_asize(vd);
if (offset + asize > vd->vdev_asize) {
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
- "blkptr at %p DVA %u has invalid OFFSET %llu",
+ "blkptr at %px DVA %u has invalid OFFSET %llu",
bp, i, (longlong_t)offset);
}
}
- if (errors > 0)
- dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
- if (!config_held)
+ if (blk_config == BLK_CONFIG_NEEDED)
spa_config_exit(spa, SCL_VDEV, bp);
return (errors == 0);
@@ -1072,6 +1214,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
boolean_t
zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
{
+ (void) bp;
uint64_t vdevid = DVA_GET_VDEV(dva);
if (vdevid >= spa->spa_root_vdev->vdev_children)
@@ -1102,11 +1245,11 @@ zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
+ zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
- zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
+ zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
data, size, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
@@ -1119,9 +1262,8 @@ zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
- zio_done_func_t *physdone, zio_done_func_t *done,
- void *private, zio_priority_t priority, enum zio_flag flags,
- const zbookmark_phys_t *zb)
+ zio_done_func_t *done, void *private, zio_priority_t priority,
+ zio_flag_t flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
@@ -1141,7 +1283,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio->io_ready = ready;
zio->io_children_ready = children_ready;
- zio->io_physdone = physdone;
zio->io_prop = *zp;
/*
@@ -1163,7 +1304,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
uint64_t size, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
+ zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb)
{
zio_t *zio;
@@ -1175,12 +1316,14 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
}
void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
+ boolean_t brtwrite)
{
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+ ASSERT(!brtwrite || !nopwrite);
/*
* We must reset the io_prop to match the values that existed
@@ -1189,6 +1332,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
*/
zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
zio->io_prop.zp_nopwrite = nopwrite;
+ zio->io_prop.zp_brtwrite = brtwrite;
zio->io_prop.zp_copies = copies;
zio->io_bp_override = bp;
}
@@ -1197,7 +1341,7 @@ void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
- (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT);
+ (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
/*
* The check for EMBEDDED is a performance optimization. We
@@ -1206,7 +1350,6 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
*/
if (BP_IS_EMBEDDED(bp))
return;
- metaslab_check_free(spa, bp);
/*
* Frees that are for the currently-syncing txg, are not going to be
@@ -1222,7 +1365,9 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
BP_GET_DEDUP(bp) ||
txg != spa->spa_syncing_txg ||
(spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
- !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
+ !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
+ brt_maybe_exists(spa, bp)) {
+ metaslab_check_free(spa, bp);
bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
} else {
VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
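
zio_free() now also defers a block whose pointer may still be recorded in the Block Reference Table (brt_maybe_exists()), since deciding whether it can really be freed requires a BRT lookup that should not run in this context; metaslab_check_free() correspondingly moves inside the deferred branch. A sketch of the visible part of the defer-or-free-now decision (parameter names are illustrative, and the upstream predicate has additional terms not shown in this hunk):

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    must_defer_free(bool is_dedup, bool maybe_cloned, uint64_t txg,
        uint64_t syncing_txg, int sync_pass, int deferred_free_pass,
        bool log_spacemap_active)
    {
            return (is_dedup || maybe_cloned || txg != syncing_txg ||
                (sync_pass >= deferred_free_pass && !log_spacemap_active));
    }
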
@@ -1236,7 +1381,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
*/
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- enum zio_flag flags)
+ zio_flag_t flags)
{
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
@@ -1248,11 +1393,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
arc_freed(spa, bp);
dsl_scan_freed(spa, bp);
- if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) {
+ if (BP_IS_GANG(bp) ||
+ BP_GET_DEDUP(bp) ||
+ brt_maybe_exists(spa, bp)) {
/*
- * GANG and DEDUP blocks can induce a read (for the gang block
- * header, or the DDT), so issue them asynchronously so that
- * this thread is not tied up.
+ * GANG, DEDUP and BRT blocks can induce a read (for the gang
+ * block header, the DDT or the BRT), so issue them
+ * asynchronously so that this thread is not tied up.
*/
enum zio_stage stage =
ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
@@ -1269,12 +1416,12 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- zio_done_func_t *done, void *private, enum zio_flag flags)
+ zio_done_func_t *done, void *private, zio_flag_t flags)
{
zio_t *zio;
- (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
- BLK_VERIFY_HALT);
+ (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ?
+ BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
if (BP_IS_EMBEDDED(bp))
return (zio_null(pio, spa, NULL, NULL, NULL, 0));
@@ -1291,7 +1438,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
* starts allocating blocks -- so that nothing is allocated twice.
* If txg == 0 we just verify that the block is claimable.
*/
- ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
spa_min_claim_txg(spa));
ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */
@@ -1305,33 +1452,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
}
zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, enum zio_flag flags)
-{
- zio_t *zio;
- int c;
-
- if (vd->vdev_children == 0) {
- zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
- ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
- ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-
- zio->io_cmd = cmd;
- } else {
- zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
-
- for (c = 0; c < vd->vdev_children; c++)
- zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
- done, private, flags));
- }
-
- return (zio);
-}
-
-zio_t *
zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_done_func_t *done, void *private, zio_priority_t priority,
- enum zio_flag flags, enum trim_flag trim_flags)
+ zio_flag_t flags, enum trim_flag trim_flags)
{
zio_t *zio;
@@ -1351,7 +1474,7 @@ zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
abd_t *data, int checksum, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+ zio_priority_t priority, zio_flag_t flags, boolean_t labels)
{
zio_t *zio;
@@ -1372,7 +1495,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
abd_t *data, int checksum, zio_done_func_t *done, void *private,
- zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+ zio_priority_t priority, zio_flag_t flags, boolean_t labels)
{
zio_t *zio;
@@ -1409,7 +1532,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
abd_t *data, uint64_t size, int type, zio_priority_t priority,
- enum zio_flag flags, zio_done_func_t *done, void *private)
+ zio_flag_t flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
@@ -1468,22 +1591,17 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
flags &= ~ZIO_FLAG_IO_ALLOCATING;
}
-
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
- zio->io_physdone = pio->io_physdone;
- if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
- zio->io_logical->io_phys_children++;
-
return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
- zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+ zio_type_t type, zio_priority_t priority, zio_flag_t flags,
zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -1499,12 +1617,29 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
return (zio);
}
+
+/*
+ * Send a flush command to the given vdev. Unlike most zio creation functions,
+ * the flush zios are issued immediately. You can wait on pio to pause until
+ * the flushes complete.
+ */
void
-zio_flush(zio_t *zio, vdev_t *vd)
+zio_flush(zio_t *pio, vdev_t *vd)
{
- zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+ const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY;
+
+ if (vd->vdev_nowritecache)
+ return;
+
+ if (vd->vdev_children == 0) {
+ zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
+ NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0,
+ NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
+ } else {
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ zio_flush(pio, vd->vdev_child[c]);
+ }
}
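
With zio_ioctl() gone, zio_flush() builds the flush zio itself (ZIO_TYPE_FLUSH with ZIO_FLUSH_PIPELINE), skips vdevs whose write cache is disabled, and recurses through interior vdevs so only leaves are issued a command; the caller waits on pio for completion. A toy model of the leaf-only recursion:

    #include <stdbool.h>

    struct node {
            int nchildren;
            struct node **child;
            bool nowritecache;
    };

    static void
    flush_tree(struct node *n, void (*issue_leaf)(struct node *))
    {
            if (n->nowritecache)
                    return;                         /* nothing to flush here */
            if (n->nchildren == 0) {
                    issue_leaf(n);                  /* only leaves get a flush zio */
                    return;
            }
            for (int c = 0; c < n->nchildren; c++)
                    flush_tree(n->child[c], issue_leaf);
    }
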
void
@@ -1528,6 +1663,19 @@ zio_shrink(zio_t *zio, uint64_t size)
}
/*
+ * Round provided allocation size up to a value that can be allocated
+ * by at least some vdev(s) in the pool with minimum or no additional
+ * padding and without extra space usage on others
+ */
+static uint64_t
+zio_roundup_alloc_size(spa_t *spa, uint64_t size)
+{
+ if (size > spa->spa_min_alloc)
+ return (roundup(size, spa->spa_gcd_alloc));
+ return (spa->spa_min_alloc);
+}
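
The new helper replaces plain roundup(psize, spa_min_alloc): sizes above the pool's smallest allocation unit are rounded to a multiple of spa_gcd_alloc, which is presumably maintained as the greatest common divisor of the vdevs' allocation quanta, so the result wastes little or no space on any of them. A standalone sketch with a worked example (field names are stand-ins):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    roundup_alloc(uint64_t size, uint64_t min_alloc, uint64_t gcd_alloc)
    {
            if (size <= min_alloc)
                    return (min_alloc);             /* tiny writes take the minimum */
            return ((size + gcd_alloc - 1) / gcd_alloc * gcd_alloc);
    }

    int
    main(void)
    {
            /* e.g. vdevs with 4 KiB and 12 KiB quanta -> gcd 4 KiB */
            printf("%llu\n", (unsigned long long)roundup_alloc(9728, 4096, 4096));
            /* prints 12288 */
            return (0);
    }
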
+
+/*
* ==========================================================================
* Prepare to read and write logical blocks
* ==========================================================================
@@ -1565,15 +1713,8 @@ zio_read_bp_init(zio_t *zio)
abd_return_buf_copy(zio->io_abd, data, psize);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
- ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
}
- if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
- zio->io_flags |= ZIO_FLAG_DONT_CACHE;
-
- if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
- zio->io_flags |= ZIO_FLAG_DONT_CACHE;
-
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
@@ -1592,12 +1733,16 @@ zio_write_bp_init(zio_t *zio)
blkptr_t *bp = zio->io_bp;
zio_prop_t *zp = &zio->io_prop;
- ASSERT(bp->blk_birth != zio->io_txg);
- ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+ ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (zp->zp_brtwrite)
+ return (zio);
+
+ ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
+
if (BP_IS_EMBEDDED(bp))
return (zio);
@@ -1649,7 +1794,7 @@ zio_write_compress(zio_t *zio)
blkptr_t *bp = zio->io_bp;
uint64_t lsize = zio->io_lsize;
uint64_t psize = zio->io_size;
- int pass = 1;
+ uint32_t pass = 1;
/*
* If our children haven't all reached the ready stage,
@@ -1676,7 +1821,7 @@ zio_write_compress(zio_t *zio)
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
ASSERT(zio->io_bp_override == NULL);
- if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+ if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
@@ -1696,19 +1841,23 @@ zio_write_compress(zio_t *zio)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
- ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
- spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+ ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) ||
+ MIN(zp->zp_copies, spa_max_replication(spa))
+ == BP_GET_NDVAS(bp));
}
/* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF &&
!(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
- void *cbuf = zio_buf_alloc(lsize);
- psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize,
+ void *cbuf = NULL;
+ psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize,
zp->zp_complevel);
- if (psize == 0 || psize >= lsize) {
+ if (psize == 0) {
compress = ZIO_COMPRESS_OFF;
- zio_buf_free(cbuf, lsize);
+ } else if (psize >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ if (cbuf != NULL)
+ zio_buf_free(cbuf, lsize);
} else if (!zp->zp_dedup && !zp->zp_encrypt &&
psize <= BPE_PAYLOAD_SIZE &&
zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
@@ -1719,7 +1868,7 @@ zio_write_compress(zio_t *zio)
BP_SET_TYPE(bp, zio->io_prop.zp_type);
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
zio_buf_free(cbuf, lsize);
- bp->blk_birth = zio->io_txg;
+ BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_EMBEDDED_DATA));
@@ -1733,9 +1882,8 @@ zio_write_compress(zio_t *zio)
* in that we charge for the padding used to fill out
* the last sector.
*/
- ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
- size_t rounded = (size_t)roundup(psize,
- spa->spa_min_alloc);
+ size_t rounded = (size_t)zio_roundup_alloc_size(spa,
+ psize);
if (rounded >= lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
@@ -1771,9 +1919,15 @@ zio_write_compress(zio_t *zio)
zio->io_abd, NULL, lsize, zp->zp_complevel);
if (psize == 0 || psize >= lsize)
compress = ZIO_COMPRESS_OFF;
- } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
- size_t rounded = MIN((size_t)roundup(psize,
- spa->spa_min_alloc), lsize);
+ } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS &&
+ !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) {
+ /*
+ * If we are raw receiving an encrypted dataset we should not
+ * take this codepath because it will change the on-disk block
+ * and decryption will fail.
+ */
+ size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
+ lsize);
if (rounded != psize) {
abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);
@@ -1795,7 +1949,7 @@ zio_write_compress(zio_t *zio)
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+ if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
VERIFY3U(psize, !=, 0);
@@ -1809,7 +1963,7 @@ zio_write_compress(zio_t *zio)
}
if (psize == 0) {
- if (zio->io_bp_orig.blk_birth != 0 &&
+ if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 &&
spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, zp->zp_type);
@@ -1869,7 +2023,6 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
- int flags = (cutinline ? TQ_FRONT : 0);
/*
* If we're a config writer or a probe, the normal issue and
@@ -1887,23 +2040,18 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
/*
* If this is a high priority I/O, then use the high priority taskq if
- * available.
+ * available or cut the line otherwise.
*/
- if ((zio->io_priority == ZIO_PRIORITY_NOW ||
- zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
- spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
- q++;
+ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) {
+ if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
+ q++;
+ else
+ cutinline = B_TRUE;
+ }
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
- /*
- * NB: We are assuming that the zio can only be dispatched
- * to a single taskq at a time. It would be a grievous error
- * to dispatch the zio to another taskq at the same time.
- */
- ASSERT(taskq_empty_ent(&zio->io_tqent));
- spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags,
- &zio->io_tqent);
+ spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline);
}
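
The dispatch policy above now treats only ZIO_PRIORITY_SYNC_WRITE as special: such a zio moves up to the high-priority taskq when one is configured, and otherwise cuts to the front of the regular queue; the per-zio taskq-entry bookkeeping is gone because spa_taskq_dispatch() handles it. A small sketch of the queue selection (names illustrative):

    #include <stdbool.h>

    static void
    pick_issue_queue(bool is_sync_write, bool have_high_prio_tq,
        int *queue, bool *cutinline)
    {
            if (!is_sync_write)
                    return;                         /* normal priority: leave as-is */
            if (have_high_prio_tq)
                    (*queue)++;                     /* ISSUE -> ISSUE_HIGH */
            else
                    *cutinline = true;              /* TQ_FRONT on the regular queue */
    }
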
static boolean_t
@@ -1928,8 +2076,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
static zio_t *
zio_issue_async(zio_t *zio)
{
+ ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio));
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
-
return (NULL);
}
@@ -2027,7 +2175,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth)
"delta=%llu queued=%llu io=%llu "
"path=%s "
"last=%llu type=%d "
- "priority=%d flags=0x%x stage=0x%x "
+ "priority=%d flags=0x%llx stage=0x%x "
"pipeline=0x%x pipeline-trace=0x%x "
"objset=%llu object=%llu "
"level=%llu blkid=%llu "
@@ -2037,8 +2185,8 @@ zio_deadman_impl(zio_t *pio, int ziodepth)
(u_longlong_t)delta, pio->io_delta, pio->io_delay,
vd ? vd->vdev_path : "NULL",
vq ? vq->vq_io_complete_ts : 0, pio->io_type,
- pio->io_priority, pio->io_flags, pio->io_stage,
- pio->io_pipeline, pio->io_pipeline_trace,
+ pio->io_priority, (u_longlong_t)pio->io_flags,
+ pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
(u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
(u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid,
(u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size,
@@ -2065,7 +2213,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth)
* using the zfs_dbgmsg() interface then post deadman event for the ZED.
*/
void
-zio_deadman(zio_t *pio, char *tag)
+zio_deadman(zio_t *pio, const char *tag)
{
spa_t *spa = pio->io_spa;
char *name = spa_name(spa);
@@ -2143,6 +2291,8 @@ zio_execute_stack_check(zio_t *zio)
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
!zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
return (B_TRUE);
+#else
+ (void) zio;
#endif /* HAVE_LARGE_STACKS */
return (B_FALSE);
@@ -2241,6 +2391,9 @@ zio_wait(zio_t *zio)
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ spa_select_allocator(zio);
+ }
__zio_execute(zio);
mutex_enter(&zio->io_lock);
@@ -2277,7 +2430,7 @@ zio_nowait(zio_t *zio)
ASSERT3P(zio->io_executor, ==, NULL);
if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
- zio_unique_parent(zio) == NULL) {
+ list_is_empty(&zio->io_parent_list)) {
zio_t *pio;
/*
@@ -2293,6 +2446,9 @@ zio_nowait(zio_t *zio)
ASSERT0(zio->io_queued_timestamp);
zio->io_queued_timestamp = gethrtime();
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ spa_select_allocator(zio);
+ }
__zio_execute(zio);
}
@@ -2306,13 +2462,14 @@ static void
zio_reexecute(void *arg)
{
zio_t *pio = arg;
- zio_t *cio, *cio_next;
+ zio_t *cio, *cio_next, *gio;
ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
ASSERT(pio->io_gang_leader == NULL);
ASSERT(pio->io_gang_tree == NULL);
+ mutex_enter(&pio->io_lock);
pio->io_flags = pio->io_orig_flags;
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
@@ -2320,8 +2477,16 @@ zio_reexecute(void *arg)
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_pipeline_trace = 0;
pio->io_error = 0;
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_state[w] = 0;
+ pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) ||
+ (pio->io_pipeline & ZIO_STAGE_READY) == 0;
+ pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE);
+ zio_link_t *zl = NULL;
+ while ((gio = zio_walk_parents(pio, &zl)) != NULL) {
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
+ gio->io_children[pio->io_child_type][w] +=
+ !pio->io_state[w];
+ }
+ }
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
@@ -2335,12 +2500,9 @@ zio_reexecute(void *arg)
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
- zio_link_t *zl = NULL;
- mutex_enter(&pio->io_lock);
+ zl = NULL;
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
- for (int w = 0; w < ZIO_WAIT_TYPES; w++)
- pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
mutex_enter(&pio->io_lock);
@@ -2366,8 +2528,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));
- cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
- "failure and has been suspended.\n", spa_name(spa));
+ if (reason != ZIO_SUSPEND_MMP) {
+ cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
+ "I/O failure and has been suspended.\n", spa_name(spa));
+ }
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);
@@ -2555,11 +2719,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
return (zio);
}
-/* ARGSUSED */
static zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
+ (void) gn, (void) data, (void) offset;
+
zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
ZIO_GANG_CHILD_FLAGS(pio));
if (zio == NULL) {
@@ -2569,11 +2734,11 @@ zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
return (zio);
}
-/* ARGSUSED */
static zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{
+ (void) gn, (void) data, (void) offset;
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}
@@ -2652,7 +2817,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
blkptr_t *bp = zio->io_bp;
ASSERT(gio == zio_unique_parent(zio));
- ASSERT(zio->io_child_count == 0);
+ ASSERT(list_is_empty(&zio->io_child_list));
if (zio->io_error)
return;
@@ -2751,6 +2916,12 @@ zio_gang_issue(zio_t *zio)
}
static void
+zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
+{
+ cio->io_allocator = pio->io_allocator;
+}
+
+static void
zio_write_gang_member_ready(zio_t *zio)
{
zio_t *pio = zio_unique_parent(zio);
@@ -2768,7 +2939,7 @@ zio_write_gang_member_ready(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
- ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+ VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
mutex_enter(&pio->io_lock);
for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
@@ -2806,19 +2977,22 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
uint64_t resid = pio->io_size;
uint64_t lsize;
int copies = gio->io_prop.zp_copies;
- int gbh_copies;
zio_prop_t zp;
int error;
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
/*
- * encrypted blocks need DVA[2] free so encrypted gang headers can't
- * have a third copy.
+ * If one copy was requested, store 2 copies of the GBH, so that we
+ * can still traverse all the data (e.g. to free or scrub) even if a
+ * block is damaged. Note that we can't store 3 copies of the GBH in
+ * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
*/
- gbh_copies = MIN(copies + 1, spa_max_replication(spa));
- if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
- gbh_copies = SPA_DVAS_PER_BP - 1;
+ int gbh_copies = copies;
+ if (gbh_copies == 1) {
+ gbh_copies = MIN(2, spa_max_replication(spa));
+ }
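
The gang-header replication rule changes from "always one more copy than the data, capped for encryption" to "match the data, except bump a single-copy request to two": a damaged gang header would otherwise make all of its children unreachable, and because the header count no longer exceeds the data copies, the old explicit clamp for encrypted blocks (whose DVA[2] slot holds the IV and salt) is not needed here. A one-function sketch of the new policy:

    static int
    gbh_copies_for(int data_copies, int max_replication)
    {
            int gbh = data_copies;
            if (gbh == 1)
                    gbh = max_replication < 2 ? max_replication : 2;
            return (gbh);   /* copies=1 -> 2 header copies when the pool allows it */
    }
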
+ ASSERT(ZIO_HAS_ALLOCATOR(pio));
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2872,7 +3046,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
gn = zio_gang_node_alloc(gnpp);
gbh = gn->gn_gbh;
- bzero(gbh, SPA_GANGBLOCKSIZE);
+ memset(gbh, 0, SPA_GANGBLOCKSIZE);
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
/*
@@ -2882,6 +3056,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ zio_gang_inherit_allocator(pio, zio);
+
/*
* Create and nowait the gang children.
*/
@@ -2901,17 +3077,19 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
zp.zp_nopwrite = B_FALSE;
zp.zp_encrypt = gio->io_prop.zp_encrypt;
zp.zp_byteorder = gio->io_prop.zp_byteorder;
- bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
- bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
- bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
+ memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
+ memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
+ memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
has_data ? abd_get_offset(pio->io_abd, pio->io_size -
resid) : NULL, lsize, lsize, &zp,
- zio_write_gang_member_ready, NULL, NULL,
+ zio_write_gang_member_ready, NULL,
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ zio_gang_inherit_allocator(zio, cio);
+
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(has_data);
@@ -2932,11 +3110,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
*/
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- /*
- * We didn't allocate this bp, so make sure it doesn't get unmarked.
- */
- pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
-
zio_nowait(zio);
return (pio);
@@ -2967,6 +3140,7 @@ zio_nop_write(zio_t *zio)
blkptr_t *bp_orig = &zio->io_bp_orig;
zio_prop_t *zp = &zio->io_prop;
+ ASSERT(BP_IS_HOLE(bp));
ASSERT(BP_GET_LEVEL(bp) == 0);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
ASSERT(zp->zp_nopwrite);
@@ -3000,8 +3174,7 @@ zio_nop_write(zio_t *zio)
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
- ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
- sizeof (uint64_t)) == 0);
+ ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop);
/*
* If we're overwriting a block that is currently on an
@@ -3009,11 +3182,13 @@ zio_nop_write(zio_t *zio)
* allow a new block to be allocated on a concrete vdev.
*/
spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
- vdev_t *tvd = vdev_lookup_top(zio->io_spa,
- DVA_GET_VDEV(&bp->blk_dva[0]));
- if (tvd->vdev_ops == &vdev_indirect_ops) {
- spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
- return (zio);
+ for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) {
+ vdev_t *tvd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(&bp_orig->blk_dva[d]));
+ if (tvd->vdev_ops == &vdev_indirect_ops) {
+ spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
+ return (zio);
+ }
}
spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
@@ -3027,6 +3202,35 @@ zio_nop_write(zio_t *zio)
/*
* ==========================================================================
+ * Block Reference Table
+ * ==========================================================================
+ */
+static zio_t *
+zio_brt_free(zio_t *zio)
+{
+ blkptr_t *bp;
+
+ bp = zio->io_bp;
+
+ if (BP_GET_LEVEL(bp) > 0 ||
+ BP_IS_METADATA(bp) ||
+ !brt_maybe_exists(zio->io_spa, bp)) {
+ return (zio);
+ }
+
+ if (!brt_entry_decref(zio->io_spa, bp)) {
+ /*
+ * This isn't the last reference, so we cannot free
+ * the data yet.
+ */
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ }
+
+ return (zio);
+}
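+
+/*
+ * Editor's sketch (not part of the patch): the decision made by zio_brt_free()
+ * above, reduced to a standalone predicate.  The flag arguments stand in for
+ * BP_GET_LEVEL()/BP_IS_METADATA()/brt_maybe_exists()/brt_entry_decref(); the
+ * helper name is hypothetical.
+ */
+static boolean_t	/* B_TRUE when the block's data may really be freed */
+brt_free_decision_sketch(boolean_t is_indirect, boolean_t is_metadata,
+    boolean_t maybe_in_brt, boolean_t last_reference)
+{
+	if (is_indirect || is_metadata || !maybe_in_brt)
+		return (B_TRUE);	/* never cloned: free as usual */
+	return (last_reference);	/* cloned: free only on the last decref */
+}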
+
+/*
+ * ==========================================================================
* Dedup
* ==========================================================================
*/
@@ -3332,14 +3536,14 @@ zio_ddt_write(zio_t *zio)
else
ddt_phys_addref(ddp);
} else if (zio->io_bp_override) {
- ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
- zio_ddt_child_write_ready, NULL, NULL,
+ zio_ddt_child_write_ready, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@@ -3354,7 +3558,7 @@ zio_ddt_write(zio_t *zio)
return (zio);
}
-ddt_entry_t *freedde; /* for debugging */
+static ddt_entry_t *freedde; /* for debugging */
static zio_t *
zio_ddt_free(zio_t *zio)
@@ -3398,6 +3602,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
return (NULL);
ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
/*
* Try to place a reservation for this zio. If we're unable to
@@ -3434,21 +3639,12 @@ zio_dva_throttle(zio_t *zio)
}
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
- zbookmark_phys_t *bm = &zio->io_bookmark;
- /*
- * We want to try to use as many allocators as possible to help improve
- * performance, but we also want logically adjacent IOs to be physically
- * adjacent to improve sequential read performance. We chunk each object
- * into 2^20 block regions, and then hash based on the objset, object,
- * level, and region to accomplish both of these goals.
- */
- int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
- bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
- zio->io_allocator = allocator;
+ int allocator = zio->io_allocator;
zio->io_metaslab_class = mc;
mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
@@ -3493,7 +3689,6 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
- flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
if (zio->io_flags & ZIO_FLAG_NODATA)
flags |= METASLAB_DONT_THROTTLE;
if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
@@ -3523,6 +3718,7 @@ zio_dva_allocate(zio_t *zio)
* sync write performance. If a log allocation fails, we will fall
* back to spa_sync() which is abysmal for performance.
*/
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
@@ -3611,11 +3807,13 @@ zio_dva_claim(zio_t *zio)
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(zio->io_bp_override == NULL);
- if (!BP_IS_HOLE(bp))
- metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
+ if (!BP_IS_HOLE(bp)) {
+ metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
+ B_TRUE);
+ }
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@@ -3653,7 +3851,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* of, so we just hash the objset ID to pick the allocator to get
* some parallelism.
*/
- int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+ int flags = METASLAB_ZIL;
int allocator = (uint_t)cityhash4(0, 0, 0,
os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
@@ -3755,7 +3953,7 @@ zio_vdev_io_start(zio_t *zio)
* Note: the code can handle other kinds of writes,
* but we don't expect them.
*/
- if (zio->io_vd->vdev_removing) {
+ if (zio->io_vd->vdev_noalloc) {
ASSERT(zio->io_flags &
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
@@ -3857,8 +4055,15 @@ zio_vdev_io_start(zio_t *zio)
zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_TRIM)) {
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
- return (zio);
+ if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) {
+ /*
+ * "no-op" injections return success, but do no actual
+ * work. Just skip the remaining vdev stages.
+ */
+ zio_vdev_io_bypass(zio);
+ zio_interrupt(zio);
+ return (NULL);
+ }
if ((zio = vdev_queue_io(zio)) == NULL)
return (NULL);
@@ -3887,17 +4092,17 @@ zio_vdev_io_done(zio_t *zio)
}
ASSERT(zio->io_type == ZIO_TYPE_READ ||
- zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FLUSH ||
+ zio->io_type == ZIO_TYPE_TRIM);
if (zio->io_delay)
zio->io_delay = gethrtime() - zio->io_delay;
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
vd->vdev_ops != &vdev_draid_spare_ops) {
- vdev_queue_io_done(zio);
-
- if (zio->io_type == ZIO_TYPE_WRITE)
- vdev_cache_write(zio);
+ if (zio->io_type != ZIO_TYPE_FLUSH)
+ vdev_queue_io_done(zio);
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_device_injections(vd, zio,
@@ -3906,7 +4111,8 @@ zio_vdev_io_done(zio_t *zio)
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_label_injection(zio, EIO);
- if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
+ if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH &&
+ zio->io_type != ZIO_TYPE_TRIM) {
if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
} else {
@@ -3917,7 +4123,7 @@ zio_vdev_io_done(zio_t *zio)
ops->vdev_op_io_done(zio);
- if (unexpected_error)
+ if (unexpected_error && vd->vdev_remove_wanted == B_FALSE)
VERIFY(vdev_probe(vd, zio) == NULL);
return (zio);
@@ -3964,7 +4170,6 @@ zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}
-/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
{
@@ -4009,8 +4214,7 @@ zio_vdev_io_assess(zio_t *zio)
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
zio->io_error = 0;
- zio->io_flags |= ZIO_FLAG_IO_RETRY |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
+ zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
zio_requeue_io_start_cut_in_line);
@@ -4043,20 +4247,12 @@ zio_vdev_io_assess(zio_t *zio)
* boolean flag so that we don't bother with it in the future.
*/
if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
- zio->io_type == ZIO_TYPE_IOCTL &&
- zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+ zio->io_type == ZIO_TYPE_FLUSH && vd != NULL)
vd->vdev_nowritecache = B_TRUE;
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
- zio->io_physdone != NULL) {
- ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
- ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
- zio->io_physdone(zio->io_logical);
- }
-
return (zio);
}
@@ -4305,12 +4501,12 @@ zio_checksum_verify(zio_t *zio)
zio->io_error = error;
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- (void) zfs_ereport_start_checksum(zio->io_spa,
- zio->io_vd, &zio->io_bookmark, zio,
- zio->io_offset, zio->io_size, &info);
mutex_enter(&zio->io_vd->vdev_stat_lock);
zio->io_vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&zio->io_vd->vdev_stat_lock);
+ (void) zfs_ereport_start_checksum(zio->io_spa,
+ zio->io_vd, &zio->io_bookmark, zio,
+ zio->io_offset, zio->io_size, &info);
}
}
@@ -4364,22 +4560,24 @@ zio_ready(zio_t *zio)
zio_t *pio, *pio_next;
zio_link_t *zl = NULL;
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
- ZIO_WAIT_READY)) {
+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
+ ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
return (NULL);
}
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
- ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
- (zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
+ BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
zio->io_ready(zio);
}
+#ifdef ZFS_DEBUG
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
+#endif
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -4388,6 +4586,7 @@ zio_ready(zio_t *zio)
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
/*
* We were unable to allocate anything, unreserve and
@@ -4418,7 +4617,7 @@ zio_ready(zio_t *zio)
}
if (zio->io_flags & ZIO_FLAG_NODATA) {
- if (BP_IS_GANG(bp)) {
+ if (bp != NULL && BP_IS_GANG(bp)) {
zio->io_flags &= ~ZIO_FLAG_NODATA;
} else {
ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
@@ -4474,6 +4673,7 @@ zio_dva_throttle_done(zio_t *zio)
}
ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT(ZIO_HAS_ALLOCATOR(pio));
ASSERT3P(zio, !=, zio->io_logical);
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
@@ -4536,6 +4736,7 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_bp != NULL);
+ ASSERT(ZIO_HAS_ALLOCATOR(zio));
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator);
@@ -4551,7 +4752,7 @@ zio_done(zio_t *zio)
if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
ASSERT(zio->io_bp->blk_pad[0] == 0);
ASSERT(zio->io_bp->blk_pad[1] == 0);
- ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
+ ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
sizeof (blkptr_t)) == 0 ||
(zio->io_bp == zio_unique_parent(zio)->io_bp));
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
@@ -4662,7 +4863,8 @@ zio_done(zio_t *zio)
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
- spa_log_error(zio->io_spa, &zio->io_bookmark);
+ spa_log_error(zio->io_spa, &zio->io_bookmark,
+ BP_GET_LOGICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
}
@@ -4797,15 +4999,14 @@ zio_done(zio_t *zio)
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
*/
- ASSERT(taskq_empty_ent(&zio->io_tqent));
- spa_taskq_dispatch_ent(zio->io_spa,
+ spa_taskq_dispatch(zio->io_spa,
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
- zio_reexecute, zio, 0, &zio->io_tqent);
+ zio_reexecute, zio, B_FALSE);
}
return (NULL);
}
- ASSERT(zio->io_child_count == 0);
+ ASSERT(list_is_empty(&zio->io_child_list));
ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
@@ -4820,12 +5021,6 @@ zio_done(zio_t *zio)
zfs_ereport_free_checksum(zcr);
}
- if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
- !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
- !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
- metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
- }
-
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as
@@ -4878,6 +5073,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
zio_encrypt,
zio_checksum_generate,
zio_nop_write,
+ zio_brt_free,
zio_ddt_read_start,
zio_ddt_read_done,
zio_ddt_write,
@@ -4998,7 +5194,7 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp,
{
zbookmark_phys_t mod_zb = *subtree_root;
mod_zb.zb_blkid++;
- ASSERT(last_block->zb_level == 0);
+ ASSERT0(last_block->zb_level);
/* The objset_phys_t isn't before anything. */
if (dnp == NULL)
@@ -5024,26 +5220,41 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp,
last_block) <= 0);
}
+/*
+ * This function is similar to zbookmark_subtree_completed(), but returns true
+ * if subtree_root is equal to or ahead of last_block, i.e. still to be done.
+ */
+boolean_t
+zbookmark_subtree_tbd(const dnode_phys_t *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+ ASSERT0(last_block->zb_level);
+ if (dnp == NULL)
+ return (B_FALSE);
+ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+ 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root,
+ last_block) >= 0);
+}
+
EXPORT_SYMBOL(zio_type_name);
EXPORT_SYMBOL(zio_buf_alloc);
EXPORT_SYMBOL(zio_data_buf_alloc);
EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free);
-/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
"Max I/O completion time (milliseconds) before marking it as slow");
ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
"Prioritize requeued I/O");
-ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW,
"Defer frees starting in this pass");
-ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW,
"Don't compress starting in this pass");
-ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW,
"Rewrite new bps starting in this pass");
ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
@@ -5051,4 +5262,3 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
"Log all slow ZIOs, not just those with vdevs");
-/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c
index e6b5c9588939..ce6772a40c8b 100644
--- a/sys/contrib/openzfs/module/zfs/zio_checksum.c
+++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -91,29 +91,29 @@
* invocation and passed to the checksum function.
*/
-/*ARGSUSED*/
static void
abd_checksum_off(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ (void) abd, (void) size, (void) ctx_template;
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
-/*ARGSUSED*/
static void
abd_fletcher_2_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ (void) ctx_template;
fletcher_init(zcp);
(void) abd_iterate_func(abd, 0, size,
fletcher_2_incremental_native, zcp);
}
-/*ARGSUSED*/
static void
abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ (void) ctx_template;
fletcher_init(zcp);
(void) abd_iterate_func(abd, 0, size,
fletcher_2_incremental_byteswap, zcp);
@@ -127,11 +127,11 @@ abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp)
fletcher_4_abd_ops.acf_fini(acdp);
}
-/*ARGSUSED*/
void
abd_fletcher_4_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ (void) ctx_template;
fletcher_4_ctx_t ctx;
zio_abd_checksum_data_t acd = {
@@ -144,11 +144,11 @@ abd_fletcher_4_native(abd_t *abd, uint64_t size,
}
-/*ARGSUSED*/
void
abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ (void) ctx_template;
fletcher_4_ctx_t ctx;
zio_abd_checksum_data_t acd = {
@@ -165,10 +165,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{NULL, NULL}, NULL, NULL, 0, "on"},
{{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "off"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
+ {{abd_checksum_sha256, abd_checksum_sha256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"label"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
+ {{abd_checksum_sha256, abd_checksum_sha256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"gang_header"},
{{abd_fletcher_2_native, abd_fletcher_2_byteswap},
@@ -177,14 +177,14 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
NULL, NULL, 0, "fletcher2"},
{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
+ {{abd_checksum_sha256, abd_checksum_sha256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
{{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "noparity"},
- {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
+ {{abd_checksum_sha512_native, abd_checksum_sha512_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
{{abd_checksum_skein_native, abd_checksum_skein_byteswap},
@@ -195,6 +195,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+ {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap},
+ abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"},
};
/*
@@ -207,6 +211,8 @@ zio_checksum_to_feature(enum zio_checksum cksum)
VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
switch (cksum) {
+ case ZIO_CHECKSUM_BLAKE3:
+ return (SPA_FEATURE_BLAKE3);
case ZIO_CHECKSUM_SHA512:
return (SPA_FEATURE_SHA512);
case ZIO_CHECKSUM_SKEIN:
@@ -266,7 +272,7 @@ static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
const dva_t *dva = BP_IDENTITY(bp);
- uint64_t txg = BP_PHYSICAL_BIRTH(bp);
+ uint64_t txg = BP_GET_BIRTH(bp);
ASSERT(BP_IS_GANG(bp));
@@ -351,17 +357,20 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
zio_eck_t eck;
size_t eck_offset;
- bzero(&saved, sizeof (zio_cksum_t));
+ memset(&saved, 0, sizeof (zio_cksum_t));
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t zilc;
abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
- size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ,
- uint64_t);
+ uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused,
+ ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(size, >=, nused);
+ size = nused;
eck = zilc.zc_eck;
eck_offset = offsetof(zil_chain_t, zc_eck);
} else {
+ ASSERT3U(size, >=, sizeof (zio_eck_t));
eck_offset = size - sizeof (zio_eck_t);
abd_copy_to_buf_off(&eck, abd, eck_offset,
sizeof (zio_eck_t));
@@ -417,6 +426,9 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
zio_checksum_template_init(checksum, spa);
+ IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED);
+ IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL);
+
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_cksum_t verifier;
size_t eck_offset;
@@ -439,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
return (SET_ERROR(ECKSUM));
}
- if (nused > size) {
+ nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ if (size < nused)
return (SET_ERROR(ECKSUM));
- }
-
- size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ size = nused;
} else {
+ if (size < sizeof (zio_eck_t))
+ return (SET_ERROR(ECKSUM));
eck_offset = size - sizeof (zio_eck_t);
abd_copy_to_buf_off(&eck, abd, eck_offset,
sizeof (zio_eck_t));
@@ -506,8 +519,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
}
if (info != NULL) {
- info->zbc_expected = expected_cksum;
- info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;
diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c
index 1ff1e76d7f22..c8a10db7483b 100644
--- a/sys/contrib/openzfs/module/zfs/zio_compress.c
+++ b/sys/contrib/openzfs/module/zfs/zio_compress.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -44,7 +44,7 @@
* If nonzero, every 1/X decompression attempts will fail, simulating
* an undetected memory error.
*/
-unsigned long zio_decompress_fail_fraction = 0;
+static unsigned long zio_decompress_fail_fraction = 0;
/*
* Compression vectors.
@@ -66,7 +66,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{"gzip-9", 9, gzip_compress, gzip_decompress, NULL},
{"zle", 64, zle_compress, zle_decompress, NULL},
{"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL},
- {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress,
+ {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap,
zfs_zstd_decompress, zfs_zstd_decompress_level},
};
@@ -74,6 +74,7 @@ uint8_t
zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child,
uint8_t parent)
{
+ (void) spa;
uint8_t result;
if (!ZIO_COMPRESS_HASLEVEL(compress))
@@ -110,10 +111,11 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
return (result);
}
-/*ARGSUSED*/
static int
zio_compress_zeroed_cb(void *data, size_t len, void *private)
{
+ (void) private;
+
uint64_t *end = (uint64_t *)((char *)data + len);
for (uint64_t *word = (uint64_t *)data; word < end; word++)
if (*word != 0)
@@ -123,7 +125,7 @@ zio_compress_zeroed_cb(void *data, size_t len, void *private)
}
size_t
-zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len,
+zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len,
uint8_t level)
{
size_t c_len, d_len;
@@ -161,9 +163,12 @@ zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len,
ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT);
}
+ if (*dst == NULL)
+ *dst = zio_buf_alloc(s_len);
+
/* No compression algorithms can read from ABDs directly */
void *tmp = abd_borrow_buf_copy(src, s_len);
- c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel);
+ c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel);
abd_return_buf(src, tmp, s_len);
if (c_len > d_len)
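/*
 * Editor's sketch (not part of the patch): the lazy destination-buffer
 * allocation that the zio_compress_data() signature change above introduces,
 * mimicked with plain libc so it can be read on its own.  The function and its
 * behavior here are hypothetical stand-ins, not ZFS APIs.
 */
#include <stdlib.h>
#include <string.h>

static size_t
compress_sketch(const void *src, void **dst, size_t s_len)
{
	if (*dst == NULL)		/* caller did not supply a buffer */
		*dst = malloc(s_len);
	if (*dst == NULL)
		return (s_len);		/* treat allocation failure as incompressible */
	memcpy(*dst, src, s_len);	/* stand-in for the real compressor */
	return (s_len);
}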
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index feaf41dc65e3..012a0e3c6c17 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2024, Klara Inc.
*/
/*
@@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0;
typedef struct inject_handler {
int zi_id;
spa_t *zi_spa;
+ char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */
zinject_record_t zi_record;
uint64_t *zi_lanes;
int zi_next_lane;
@@ -148,7 +150,8 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
zb->zb_level == record->zi_level &&
zb->zb_blkid >= record->zi_start &&
zb->zb_blkid <= record->zi_end &&
- (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ (record->zi_dvas == 0 ||
+ (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
error == record->zi_error) {
return (freq_triggered(record->zi_freq));
}
@@ -161,7 +164,7 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
* specified by tag.
*/
void
-zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
inject_handler_t *handler;
@@ -341,15 +344,14 @@ zio_handle_label_injection(zio_t *zio, int error)
return (ret);
}
-/*ARGSUSED*/
static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
- zio_t *zio __maybe_unused = private;
+ zio_t *zio = private;
uint8_t *buffer = data;
uint_t byte = random_in_range(len);
- ASSERT(zio->io_type == ZIO_TYPE_READ);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
/* flip a single random bit in an abd data buffer */
buffer[byte] ^= 1 << random_in_range(8);
@@ -364,10 +366,10 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
int ret = 0;
/*
- * We skip over faults in the labels unless it's during
- * device open (i.e. zio == NULL).
+ * We skip over faults in the labels unless it's during device open
+	 * (i.e. zio == NULL) or a device flush (offset is meaningless).
*/
- if (zio != NULL) {
+ if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) {
uint64_t offset = zio->io_offset;
if (offset < VDEV_LABEL_START_SIZE ||
@@ -605,6 +607,12 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
+ /* also match on I/O type (e.g., -T read) */
+ if (handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type) {
+ continue;
+ }
+
/*
* Defensive; should never happen as the array allocation
* occurs prior to inserting this handler on the list.
@@ -699,6 +707,63 @@ zio_handle_io_delay(zio_t *zio)
return (min_target);
}
+static void
+zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
+{
+ inject_handler_t *handler;
+ hrtime_t delay = 0;
+ int id = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers);
+ handler != NULL && handler->zi_record.zi_cmd == command;
+ handler = list_next(&inject_handlers, handler)) {
+ ASSERT3P(handler->zi_spa_name, !=, NULL);
+ if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
+ uint64_t pause =
+ SEC2NSEC(handler->zi_record.zi_duration);
+ if (pause > elapsed) {
+ delay = pause - elapsed;
+ }
+ id = handler->zi_id;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ if (delay) {
+ if (command == ZINJECT_DELAY_IMPORT) {
+ spa_import_progress_set_notes(spa, "injecting %llu "
+ "sec delay", (u_longlong_t)NSEC2SEC(delay));
+ }
+ zfs_sleep_until(gethrtime() + delay);
+ }
+ if (id) {
+ /* all done with this one-shot handler */
+ zio_clear_fault(id);
+ }
+}
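+
+/*
+ * Editor's sketch (not part of the patch): the remaining-delay arithmetic used
+ * by zio_handle_pool_delay() above.  SEC2NSEC() is expanded inline and the
+ * helper name is hypothetical.
+ */
+#include <stdint.h>
+
+static uint64_t
+remaining_delay_sketch(uint64_t duration_sec, uint64_t elapsed_ns)
+{
+	uint64_t pause_ns = duration_sec * 1000000000ULL;	/* SEC2NSEC() */
+
+	/* Sleep only for whatever part of the requested pause has not elapsed. */
+	return (pause_ns > elapsed_ns ? pause_ns - elapsed_ns : 0);
+}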
+
+/*
+ * For testing, inject a delay during an import
+ */
+void
+zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
+{
+ zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
+}
+
+/*
+ * For testing, inject a delay during an export
+ */
+void
+zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
+{
+ zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
+}
+
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
@@ -756,6 +821,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record)
return (0);
}
+static boolean_t
+zio_pool_handler_exists(const char *name, zinject_type_t command)
+{
+ boolean_t exists = B_FALSE;
+
+ rw_enter(&inject_lock, RW_READER);
+ for (inject_handler_t *handler = list_head(&inject_handlers);
+ handler != NULL; handler = list_next(&inject_handlers, handler)) {
+ if (command != handler->zi_record.zi_cmd)
+ continue;
+
+ const char *pool = (handler->zi_spa_name != NULL) ?
+ handler->zi_spa_name : spa_name(handler->zi_spa);
+ if (strcmp(name, pool) == 0) {
+ exists = B_TRUE;
+ break;
+ }
+ }
+ rw_exit(&inject_lock);
+
+ return (exists);
+}
/*
* Create a new handler for the given record. We add it to the list, adding
* a reference to the spa_t in the process. We increment zio_injection_enabled,
@@ -806,16 +893,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
if (!(flags & ZINJECT_NULL)) {
/*
- * spa_inject_ref() will add an injection reference, which will
- * prevent the pool from being removed from the namespace while
- * still allowing it to be unloaded.
+ * Pool delays for import or export don't take an
+ * injection reference on the spa. Instead they
+ * rely on matching by name.
*/
- if ((spa = spa_inject_addref(name)) == NULL)
- return (SET_ERROR(ENOENT));
+ if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
+ record->zi_cmd == ZINJECT_DELAY_EXPORT) {
+ if (record->zi_duration <= 0)
+ return (SET_ERROR(EINVAL));
+ /*
+			 * Only one import or export delay handler per pool.
+ */
+ if (zio_pool_handler_exists(name, record->zi_cmd))
+ return (SET_ERROR(EEXIST));
+
+ mutex_enter(&spa_namespace_lock);
+ boolean_t has_spa = spa_lookup(name) != NULL;
+ mutex_exit(&spa_namespace_lock);
+
+ if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
+ return (SET_ERROR(EEXIST));
+ if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
+ return (SET_ERROR(ENOENT));
+ spa = NULL;
+ } else {
+ /*
+ * spa_inject_ref() will add an injection reference,
+ * which will prevent the pool from being removed
+ * from the namespace while still allowing it to be
+ * unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (SET_ERROR(ENOENT));
+ }
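+		/*
+		 * Editor's sketch (not part of the patch): the validation the
+		 * import/export delay branch above applies, reduced to a
+		 * standalone predicate with errno-style returns.  The helper
+		 * and its flag arguments are hypothetical.
+		 *
+		 *	static int
+		 *	pool_delay_check_sketch(boolean_t is_import,
+		 *	    int64_t duration_sec, boolean_t handler_exists,
+		 *	    boolean_t pool_in_namespace)
+		 *	{
+		 *		if (duration_sec <= 0)
+		 *			return (EINVAL); // positive delay required
+		 *		if (handler_exists)
+		 *			return (EEXIST); // one per pool and direction
+		 *		if (is_import && pool_in_namespace)
+		 *			return (EEXIST); // pool already imported
+		 *		if (!is_import && !pool_in_namespace)
+		 *			return (ENOENT); // nothing to export
+		 *		return (0);
+		 *	}
+		 */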
handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
-
- handler->zi_spa = spa;
+ handler->zi_spa = spa; /* note: can be NULL */
handler->zi_record = *record;
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
@@ -828,6 +941,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
handler->zi_next_lane = 0;
}
+ if (handler->zi_spa == NULL)
+ handler->zi_spa_name = spa_strdup(name);
+ else
+ handler->zi_spa_name = NULL;
+
rw_enter(&inject_lock, RW_WRITER);
/*
@@ -887,7 +1005,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen,
if (handler) {
*record = handler->zi_record;
*id = handler->zi_id;
- (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ASSERT(handler->zi_spa || handler->zi_spa_name);
+ if (handler->zi_spa != NULL)
+ (void) strlcpy(name, spa_name(handler->zi_spa), buflen);
+ else
+ (void) strlcpy(name, handler->zi_spa_name, buflen);
ret = 0;
} else {
ret = SET_ERROR(ENOENT);
@@ -937,7 +1059,11 @@ zio_clear_fault(int id)
ASSERT3P(handler->zi_lanes, ==, NULL);
}
- spa_inject_delref(handler->zi_spa);
+ if (handler->zi_spa_name != NULL)
+ spa_strfree(handler->zi_spa_name);
+
+ if (handler->zi_spa != NULL)
+ spa_inject_delref(handler->zi_spa);
kmem_free(handler, sizeof (inject_handler_t));
atomic_dec_32(&zio_injection_enabled);
diff --git a/sys/contrib/openzfs/module/zfs/zle.c b/sys/contrib/openzfs/module/zfs/zle.c
index 0decebb13ca7..1483a65af803 100644
--- a/sys/contrib/openzfs/module/zfs/zle.c
+++ b/sys/contrib/openzfs/module/zfs/zle.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/zfs/zrlock.c b/sys/contrib/openzfs/module/zfs/zrlock.c
index a4def6053622..0d50cc4712ca 100644
--- a/sys/contrib/openzfs/module/zfs/zrlock.c
+++ b/sys/contrib/openzfs/module/zfs/zrlock.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -106,16 +106,16 @@ zrl_add_impl(zrlock_t *zrl, const char *zc)
void
zrl_remove(zrlock_t *zrl)
{
- uint32_t n;
-
#ifdef ZFS_DEBUG
if (zrl->zr_owner == curthread) {
zrl->zr_owner = NULL;
zrl->zr_caller = NULL;
}
+ int32_t n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT3S(n, >=, 0);
+#else
+ atomic_dec_32((uint32_t *)&zrl->zr_refcount);
#endif
- n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
- ASSERT3S((int32_t)n, >=, 0);
}
int
diff --git a/sys/contrib/openzfs/module/zfs/zthr.c b/sys/contrib/openzfs/module/zfs/zthr.c
index 33fdda7b68d1..02b9f0805dd7 100644
--- a/sys/contrib/openzfs/module/zfs/zthr.c
+++ b/sys/contrib/openzfs/module/zfs/zthr.c
@@ -231,7 +231,7 @@ struct zthr {
const char *zthr_name;
};
-static void
+static __attribute__((noreturn)) void
zthr_procedure(void *arg)
{
zthr_t *t = arg;
@@ -469,6 +469,12 @@ zthr_iscancelled(zthr_t *t)
return (cancelled);
}
+boolean_t
+zthr_iscurthread(zthr_t *t)
+{
+ return (t->zthr_thread == curthread);
+}
+
/*
* Wait for the zthr to finish its current function. Similar to
* zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index d50cce7d7357..5b6a3f5cb410 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -90,9 +90,8 @@ unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
struct hlist_head *zvol_htable;
-list_t zvol_state_list;
+static list_t zvol_state_list;
krwlock_t zvol_state_lock;
-const zvol_platform_ops_t *ops;
typedef enum {
ZVOL_ASYNC_REMOVE_MINORS,
@@ -112,13 +111,10 @@ typedef struct {
uint64_t
zvol_name_hash(const char *name)
{
- int i;
uint64_t crc = -1ULL;
- const uint8_t *p = (const uint8_t *)name;
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
+ for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++)
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
- }
return (crc);
}
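/*
 * Editor's sketch (not part of the patch): the simplified hash loop above,
 * which now walks the name to its NUL terminator instead of bounding the walk
 * at MAXNAMELEN.  The table stands in for zfs_crc64_table and is assumed to be
 * filled in from ZFS_CRC64_POLY elsewhere; both names here are hypothetical.
 */
#include <stdint.h>

static uint64_t crc64_table_sketch[256];	/* assumed pre-initialized */

static uint64_t
name_hash_sketch(const char *name)
{
	uint64_t crc = -1ULL;

	for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++)
		crc = (crc >> 8) ^ crc64_table_sketch[(crc ^ *p) & 0xFF];
	return (crc);
}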
@@ -139,8 +135,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
zv = hlist_entry(p, zvol_state_t, zv_hlink);
mutex_enter(&zv->zv_state_lock);
- if (zv->zv_hash == hash &&
- strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
+ if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) {
/*
* this is the right zvol, take the locks in the
* right order
@@ -155,8 +150,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
* to hold zvol_state_lock
*/
ASSERT(zv->zv_hash == hash &&
- strncmp(zv->zv_name, name, MAXNAMELEN)
- == 0);
+ strcmp(zv->zv_name, name) == 0);
}
rw_exit(&zvol_state_lock);
return (zv);
@@ -365,12 +359,46 @@ out:
mutex_exit(&zv->zv_state_lock);
if (error == 0 && zv != NULL)
- ops->zv_update_volsize(zv, volsize);
+ zvol_os_update_volsize(zv, volsize);
return (SET_ERROR(error));
}
/*
+ * Update volthreading.
+ */
+int
+zvol_set_volthreading(const char *name, boolean_t value)
+{
+ zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
+ if (zv == NULL)
+ return (ENOENT);
+ zv->zv_threading = value;
+ mutex_exit(&zv->zv_state_lock);
+ return (0);
+}
+
+/*
+ * Update zvol ro property.
+ */
+int
+zvol_set_ro(const char *name, boolean_t value)
+{
+ zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
+ if (zv == NULL)
+ return (-1);
+ if (value) {
+ zvol_os_set_disk_ro(zv, 1);
+ zv->zv_flags |= ZVOL_RDONLY;
+ } else {
+ zvol_os_set_disk_ro(zv, 0);
+ zv->zv_flags &= ~ZVOL_RDONLY;
+ }
+ mutex_exit(&zv->zv_state_lock);
+ return (0);
+}
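+
+/*
+ * Editor's sketch (not part of the patch): how the two new setters above are
+ * meant to be called.  The dataset name is a made-up example and the wrapper
+ * is hypothetical; zvol_set_ro() and zvol_set_volthreading() are the functions
+ * introduced by this hunk.
+ */
+static void
+zvol_tunables_sketch(void)
+{
+	(void) zvol_set_ro("tank/vol0", B_TRUE);		/* mark read-only */
+	(void) zvol_set_volthreading("tank/vol0", B_FALSE);	/* volthreading=off */
+}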
+
+/*
* Sanity check volume block size.
*/
int
@@ -418,6 +446,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
lr_truncate_t *lr = arg2;
uint64_t offset, length;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -430,7 +460,7 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
if (error != 0) {
dmu_tx_abort(tx);
} else {
- zil_replaying(zv->zv_zilog, tx);
+ (void) zil_replaying(zv->zv_zilog, tx);
dmu_tx_commit(tx);
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset,
length);
@@ -454,6 +484,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_t *tx;
int error;
+ ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
+
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -476,7 +508,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
dmu_tx_abort(tx);
} else {
dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
- zil_replaying(zv->zv_zilog, tx);
+ (void) zil_replaying(zv->zv_zilog, tx);
dmu_tx_commit(tx);
}
@@ -486,6 +518,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
static int
zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
{
+ (void) arg1, (void) arg2, (void) byteswap;
return (SET_ERROR(ENOTSUP));
}
@@ -493,7 +526,7 @@ zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
* Callback vectors for replaying records.
* Only TX_WRITE and TX_TRUNCATE are needed for zvol.
*/
-zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
+zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* no such transaction type */
zvol_replay_err, /* TX_CREATE */
zvol_replay_err, /* TX_MKDIR */
@@ -513,6 +546,10 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* TX_MKDIR_ATTR */
zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
zvol_replay_err, /* TX_WRITE2 */
+ zvol_replay_err, /* TX_SETSAXATTR */
+ zvol_replay_err, /* TX_RENAME_EXCHANGE */
+ zvol_replay_err, /* TX_RENAME_WHITEOUT */
+ zvol_replay_err, /* TX_CLONE_RANGE */
};
/*
@@ -521,11 +558,11 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
* We store data in the log buffers if it's small enough.
* Otherwise we will later flush the data out via dmu_sync().
*/
-ssize_t zvol_immediate_write_sz = 32768;
+static const ssize_t zvol_immediate_write_sz = 32768;
void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
- uint64_t size, int sync)
+ uint64_t size, boolean_t commit)
{
uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
@@ -540,7 +577,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
else if (!spa_has_slogs(zilog->zl_spa) &&
size >= blocksize && blocksize > zvol_immediate_write_sz)
write_state = WR_INDIRECT;
- else if (sync)
+ else if (commit)
write_state = WR_COPIED;
else
write_state = WR_NEED_COPY;
@@ -575,7 +612,6 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
BP_ZERO(&lr->lr_blkptr);
itx->itx_private = zv;
- itx->itx_sync = sync;
(void) zil_itx_assign(zilog, itx, tx);
@@ -592,8 +628,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
* Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
*/
void
-zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
- boolean_t sync)
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len)
{
itx_t *itx;
lr_truncate_t *lr;
@@ -608,15 +643,14 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
lr->lr_offset = off;
lr->lr_length = len;
- itx->itx_sync = sync;
zil_itx_assign(zilog, itx, tx);
}
-/* ARGSUSED */
static void
zvol_get_done(zgd_t *zgd, int error)
{
+ (void) error;
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
@@ -640,10 +674,9 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;
ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
- zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
/*
@@ -659,6 +692,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else { /* indirect write */
+ ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's written out
* and its checksum is being calculated that no one can change
@@ -669,8 +703,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
offset = P2ALIGN_TYPED(offset, size, uint64_t);
zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
size, RL_READER);
- error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
+ error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd,
+ &db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@@ -746,15 +780,15 @@ zvol_setup_zv(zvol_state_t *zv)
if (error)
return (SET_ERROR(error));
- ops->zv_set_capacity(zv, volsize >> 9);
+ zvol_os_set_capacity(zv, volsize >> 9);
zv->zv_volsize = volsize;
if (ro || dmu_objset_is_snapshot(os) ||
!spa_writeable(dmu_objset_spa(os))) {
- ops->zv_set_disk_ro(zv, 1);
+ zvol_os_set_disk_ro(zv, 1);
zv->zv_flags |= ZVOL_RDONLY;
} else {
- ops->zv_set_disk_ro(zv, 0);
+ zvol_os_set_disk_ro(zv, 0);
zv->zv_flags &= ~ZVOL_RDONLY;
}
return (0);
@@ -867,54 +901,26 @@ int
zvol_first_open(zvol_state_t *zv, boolean_t readonly)
{
objset_t *os;
- int error, locked = 0;
- boolean_t ro;
+ int error;
ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(mutex_owned(&spa_namespace_lock));
- /*
- * In all other cases the spa_namespace_lock is taken before the
- * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
- * function calls fops->open() with the bdev->bd_mutex lock held.
- * This deadlock can be easily observed with zvols used as vdevs.
- *
- * To avoid a potential lock inversion deadlock we preemptively
- * try to take the spa_namespace_lock(). Normally it will not
- * be contended and this is safe because spa_open_common() handles
- * the case where the caller already holds the spa_namespace_lock.
- *
- * When it is contended we risk a lock inversion if we were to
- * block waiting for the lock. Luckily, the __blkdev_get()
- * function allows us to return -ERESTARTSYS which will result in
- * bdev->bd_mutex being dropped, reacquired, and fops->open() being
- * called again. This process can be repeated safely until both
- * locks are acquired.
- */
- if (!mutex_owned(&spa_namespace_lock)) {
- locked = mutex_tryenter(&spa_namespace_lock);
- if (!locked)
- return (SET_ERROR(EINTR));
- }
-
- ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
+ boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
if (error)
- goto out_mutex;
+ return (SET_ERROR(error));
zv->zv_objset = os;
error = zvol_setup_zv(zv);
-
if (error) {
dmu_objset_disown(os, 1, zv);
zv->zv_objset = NULL;
}
-out_mutex:
- if (locked)
- mutex_exit(&spa_namespace_lock);
- return (SET_ERROR(error));
+ return (error);
}
void
@@ -951,7 +957,7 @@ zvol_prefetch_minors_impl(void *arg)
job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
FTAG, &os);
if (job->error == 0) {
- dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ);
dmu_objset_disown(os, B_TRUE, FTAG);
}
}
@@ -1053,8 +1059,7 @@ zvol_add_clones(const char *dsname, list_t *minors_list)
out:
if (dd != NULL)
dsl_dir_rele(dd, FTAG);
- if (dp != NULL)
- dsl_pool_rele(dp, FTAG);
+ dsl_pool_rele(dp, FTAG);
}
/*
@@ -1102,7 +1107,7 @@ zvol_create_minors_cb(const char *dsname, void *arg)
* traverse snapshots only, do not traverse children,
* and skip the 'dsname'
*/
- error = dmu_objset_find(dsname,
+ (void) dmu_objset_find(dsname,
zvol_create_snap_minor_cb, (void *)job,
DS_FIND_SNAPSHOTS);
}
@@ -1146,7 +1151,7 @@ zvol_create_minors_recursive(const char *name)
* taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need
* any lock because all list operation is done on the current thread.
*
- * We will use this list to do zvol_create_minor_impl after prefetch
+ * We will use this list to do zvol_os_create_minor after prefetch
* so we don't have to traverse using dmu_objset_find again.
*/
list_create(&minors_list, sizeof (minors_job_t),
@@ -1160,7 +1165,7 @@ zvol_create_minors_recursive(const char *name)
&snapdev, NULL);
if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
- (void) ops->zv_create_minor(name);
+ (void) zvol_os_create_minor(name);
} else {
fstrans_cookie_t cookie = spl_fstrans_mark();
(void) dmu_objset_find(name, zvol_create_minors_cb,
@@ -1171,13 +1176,12 @@ zvol_create_minors_recursive(const char *name)
taskq_wait_outstanding(system_taskq, 0);
/*
- * Prefetch is completed, we can do zvol_create_minor_impl
+	 * Prefetch is complete, so we can do zvol_os_create_minor
* sequentially.
*/
- while ((job = list_head(&minors_list)) != NULL) {
- list_remove(&minors_list, job);
+ while ((job = list_remove_head(&minors_list)) != NULL) {
if (!job->error)
- (void) ops->zv_create_minor(job->name);
+ (void) zvol_os_create_minor(job->name);
kmem_strfree(job->name);
kmem_free(job, sizeof (minors_job_t));
}
@@ -1207,9 +1211,9 @@ zvol_create_minor(const char *name)
"snapdev", &snapdev, NULL);
if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
- (void) ops->zv_create_minor(name);
+ (void) zvol_os_create_minor(name);
} else {
- (void) ops->zv_create_minor(name);
+ (void) zvol_os_create_minor(name);
}
}
@@ -1220,7 +1224,7 @@ zvol_create_minor(const char *name)
static void
zvol_free_task(void *arg)
{
- ops->zv_free(arg);
+ zvol_os_free(arg);
}
void
@@ -1265,7 +1269,7 @@ zvol_remove_minors_impl(const char *name)
* Cleared while holding zvol_state_lock as a writer
* which will prevent zvol_open() from opening it.
*/
- ops->zv_clear_private(zv);
+ zvol_os_clear_private(zv);
/* Drop zv_state_lock before zvol_free() */
mutex_exit(&zv->zv_state_lock);
@@ -1282,10 +1286,8 @@ zvol_remove_minors_impl(const char *name)
rw_exit(&zvol_state_lock);
/* Drop zvol_state_lock before calling zvol_free() */
- while ((zv = list_head(&free_list)) != NULL) {
- list_remove(&free_list, zv);
- ops->zv_free(zv);
- }
+ while ((zv = list_remove_head(&free_list)) != NULL)
+ zvol_os_free(zv);
}
/* Remove minor for this specific volume only */
@@ -1317,7 +1319,7 @@ zvol_remove_minor_impl(const char *name)
}
zvol_remove(zv);
- ops->zv_clear_private(zv);
+ zvol_os_clear_private(zv);
mutex_exit(&zv->zv_state_lock);
break;
} else {
@@ -1329,7 +1331,7 @@ zvol_remove_minor_impl(const char *name)
rw_exit(&zvol_state_lock);
if (zv != NULL)
- ops->zv_free(zv);
+ zvol_os_free(zv);
}
/*
@@ -1339,13 +1341,12 @@ static void
zvol_rename_minors_impl(const char *oldname, const char *newname)
{
zvol_state_t *zv, *zv_next;
- int oldnamelen, newnamelen;
+ int oldnamelen;
if (zvol_inhibit_dev)
return;
oldnamelen = strlen(oldname);
- newnamelen = strlen(newname);
rw_enter(&zvol_state_lock, RW_READER);
@@ -1355,14 +1356,14 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, oldname) == 0) {
- ops->zv_rename_minor(zv, newname);
+ zvol_os_rename_minor(zv, newname);
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
(zv->zv_name[oldnamelen] == '/' ||
zv->zv_name[oldnamelen] == '@')) {
char *name = kmem_asprintf("%s%c%s", newname,
zv->zv_name[oldnamelen],
zv->zv_name + oldnamelen + 1);
- ops->zv_rename_minor(zv, name);
+ zvol_os_rename_minor(zv, name);
kmem_strfree(name);
}
@@ -1386,7 +1387,7 @@ zvol_set_snapdev_cb(const char *dsname, void *param)
switch (arg->snapdev) {
case ZFS_SNAPDEV_VISIBLE:
- (void) ops->zv_create_minor(dsname);
+ (void) zvol_os_create_minor(dsname);
break;
case ZFS_SNAPDEV_HIDDEN:
(void) zvol_remove_minor_impl(dsname);
@@ -1443,14 +1444,14 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
case ZFS_VOLMODE_GEOM:
case ZFS_VOLMODE_DEV:
(void) zvol_remove_minor_impl(name);
- (void) ops->zv_create_minor(name);
+ (void) zvol_os_create_minor(name);
break;
case ZFS_VOLMODE_DEFAULT:
(void) zvol_remove_minor_impl(name);
if (zvol_volmode == ZFS_VOLMODE_NONE)
break;
else /* if zvol_volmode is invalid defaults to "geom" */
- (void) ops->zv_create_minor(name);
+ (void) zvol_os_create_minor(name);
break;
}
spl_fstrans_unmark(cookie);
@@ -1470,9 +1471,9 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
task->op = op;
task->value = value;
- strlcpy(task->name1, name1, MAXNAMELEN);
+ strlcpy(task->name1, name1, sizeof (task->name1));
if (name2 != NULL)
- strlcpy(task->name2, name2, MAXNAMELEN);
+ strlcpy(task->name2, name2, sizeof (task->name2));
return (task);
}
@@ -1516,7 +1517,7 @@ typedef struct zvol_set_prop_int_arg {
const char *zsda_name;
uint64_t zsda_value;
zprop_source_t zsda_source;
- dmu_tx_t *zsda_tx;
+ zfs_prop_t zsda_prop;
} zvol_set_prop_int_arg_t;
/*
@@ -1524,7 +1525,7 @@ typedef struct zvol_set_prop_int_arg {
* conditions are imposed.
*/
static int
-zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
+zvol_set_common_check(void *arg, dmu_tx_t *tx)
{
zvol_set_prop_int_arg_t *zsda = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
@@ -1540,104 +1541,34 @@ zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
return (error);
}
-/* ARGSUSED */
static int
-zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
- char dsname[MAXNAMELEN];
+ zvol_set_prop_int_arg_t *zsda = arg;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
zvol_task_t *task;
- uint64_t snapdev;
+ uint64_t prop;
+ const char *prop_name = zfs_prop_to_name(zsda->zsda_prop);
dsl_dataset_name(ds, dsname);
- if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
- return (0);
- task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
- if (task == NULL)
- return (0);
-
- (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
- task, TQ_SLEEP);
- return (0);
-}
-
-/*
- * Traverse all child datasets and apply snapdev appropriately.
- * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
- * dataset and read the effective "snapdev" on every child in the callback
- * function: this is because the value is not guaranteed to be the same in the
- * whole dataset hierarchy.
- */
-static void
-zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
-{
- zvol_set_prop_int_arg_t *zsda = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- int error;
- VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
- zsda->zsda_tx = tx;
+ if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0)
+ return (0);
- error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
- if (error == 0) {
- dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
- zsda->zsda_source, sizeof (zsda->zsda_value), 1,
- &zsda->zsda_value, zsda->zsda_tx);
- dsl_dataset_rele(ds, FTAG);
+ switch (zsda->zsda_prop) {
+ case ZFS_PROP_VOLMODE:
+ task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname,
+ NULL, prop);
+ break;
+ case ZFS_PROP_SNAPDEV:
+ task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname,
+ NULL, prop);
+ break;
+ default:
+ task = NULL;
+ break;
}
- dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
- zsda, DS_FIND_CHILDREN);
- dsl_dir_rele(dd, FTAG);
-}
-
-int
-zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
-{
- zvol_set_prop_int_arg_t zsda;
-
- zsda.zsda_name = ddname;
- zsda.zsda_source = source;
- zsda.zsda_value = snapdev;
-
- return (dsl_sync_task(ddname, zvol_set_snapdev_check,
- zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
-}
-
-/*
- * Sanity check the dataset for safe use by the sync task. No additional
- * conditions are imposed.
- */
-static int
-zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
-{
- zvol_set_prop_int_arg_t *zsda = arg;
- dsl_pool_t *dp = dmu_tx_pool(tx);
- dsl_dir_t *dd;
- int error;
-
- error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
- if (error != 0)
- return (error);
-
- dsl_dir_rele(dd, FTAG);
-
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
-{
- char dsname[MAXNAMELEN];
- zvol_task_t *task;
- uint64_t volmode;
-
- dsl_dataset_name(ds, dsname);
- if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
- return (0);
- task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
if (task == NULL)
return (0);
@@ -1647,14 +1578,14 @@ zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
}
/*
- * Traverse all child datasets and apply volmode appropriately.
+ * Traverse all child datasets and apply the property appropriately.
* We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
- * dataset and read the effective "volmode" on every child in the callback
+ * dataset and read the effective property value on every child in the callback
* function: this is because the value is not guaranteed to be the same in the
* whole dataset hierarchy.
*/
static void
-zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
+zvol_set_common_sync(void *arg, dmu_tx_t *tx)
{
zvol_set_prop_int_arg_t *zsda = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
@@ -1663,33 +1594,34 @@ zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
int error;
VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
- zsda->zsda_tx = tx;
error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
if (error == 0) {
- dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop),
zsda->zsda_source, sizeof (zsda->zsda_value), 1,
- &zsda->zsda_value, zsda->zsda_tx);
+ &zsda->zsda_value, tx);
dsl_dataset_rele(ds, FTAG);
}
- dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
+ dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb,
zsda, DS_FIND_CHILDREN);
dsl_dir_rele(dd, FTAG);
}
int
-zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
+zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source,
+ uint64_t val)
{
zvol_set_prop_int_arg_t zsda;
zsda.zsda_name = ddname;
zsda.zsda_source = source;
- zsda.zsda_value = volmode;
+ zsda.zsda_value = val;
+ zsda.zsda_prop = prop;
- return (dsl_sync_task(ddname, zvol_set_volmode_check,
- zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
+ return (dsl_sync_task(ddname, zvol_set_common_check,
+ zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
}
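
/*
 * Editor's sketch (not part of the patch): the consolidated zvol_set_common()
 * above replaces the former zvol_set_snapdev()/zvol_set_volmode() entry
 * points; equivalent calls now look like the hypothetical wrappers below.
 */
static int
set_snapdev_sketch(const char *ddname, zprop_source_t source, uint64_t snapdev)
{
	return (zvol_set_common(ddname, ZFS_PROP_SNAPDEV, source, snapdev));
}

static int
set_volmode_sketch(const char *ddname, zprop_source_t source, uint64_t volmode)
{
	return (zvol_set_common(ddname, ZFS_PROP_VOLMODE, source, volmode));
}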
void
@@ -1727,13 +1659,7 @@ boolean_t
zvol_is_zvol(const char *name)
{
- return (ops->zv_is_zvol(name));
-}
-
-void
-zvol_register_ops(const zvol_platform_ops_t *zvol_ops)
-{
- ops = zvol_ops;
+ return (zvol_os_is_zvol(name));
}
int