Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
132 files changed, 25472 insertions, 11066 deletions
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in deleted file mode 100644 index 653ea0da9bcc..000000000000 --- a/sys/contrib/openzfs/module/zfs/Makefile.in +++ /dev/null @@ -1,157 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -mfdir = $(obj) -else -mfdir = $(srctree)/$(src) -endif - -MODULE := zfs - -obj-$(CONFIG_ZFS) := $(MODULE).o - -# Suppress unused-value warnings in sparc64 architecture headers -ccflags-$(CONFIG_SPARC64) += -Wno-unused-value - -$(MODULE)-objs += abd.o -$(MODULE)-objs += aggsum.o -$(MODULE)-objs += arc.o -$(MODULE)-objs += blkptr.o -$(MODULE)-objs += bplist.o -$(MODULE)-objs += bpobj.o -$(MODULE)-objs += bptree.o -$(MODULE)-objs += btree.o -$(MODULE)-objs += bqueue.o -$(MODULE)-objs += dataset_kstats.o -$(MODULE)-objs += dbuf.o -$(MODULE)-objs += dbuf_stats.o -$(MODULE)-objs += ddt.o -$(MODULE)-objs += ddt_zap.o -$(MODULE)-objs += dmu.o -$(MODULE)-objs += dmu_diff.o -$(MODULE)-objs += dmu_object.o -$(MODULE)-objs += dmu_objset.o -$(MODULE)-objs += dmu_recv.o -$(MODULE)-objs += dmu_redact.o -$(MODULE)-objs += dmu_send.o -$(MODULE)-objs += dmu_traverse.o -$(MODULE)-objs += dmu_tx.o -$(MODULE)-objs += dmu_zfetch.o -$(MODULE)-objs += dnode.o -$(MODULE)-objs += dnode_sync.o -$(MODULE)-objs += dsl_bookmark.o -$(MODULE)-objs += dsl_crypt.o -$(MODULE)-objs += dsl_dataset.o -$(MODULE)-objs += dsl_deadlist.o -$(MODULE)-objs += dsl_deleg.o -$(MODULE)-objs += dsl_destroy.o -$(MODULE)-objs += dsl_dir.o -$(MODULE)-objs += dsl_pool.o -$(MODULE)-objs += dsl_prop.o -$(MODULE)-objs += dsl_scan.o -$(MODULE)-objs += dsl_synctask.o -$(MODULE)-objs += dsl_userhold.o -$(MODULE)-objs += edonr_zfs.o -$(MODULE)-objs += fm.o -$(MODULE)-objs += gzip.o -$(MODULE)-objs += hkdf.o -$(MODULE)-objs += lz4.o -$(MODULE)-objs += lzjb.o -$(MODULE)-objs += metaslab.o -$(MODULE)-objs += mmp.o -$(MODULE)-objs += multilist.o -$(MODULE)-objs += objlist.o -$(MODULE)-objs += pathname.o -$(MODULE)-objs += range_tree.o -$(MODULE)-objs += refcount.o -$(MODULE)-objs += rrwlock.o -$(MODULE)-objs += sa.o -$(MODULE)-objs += sha256.o -$(MODULE)-objs += skein_zfs.o -$(MODULE)-objs += spa.o -$(MODULE)-objs += spa_boot.o -$(MODULE)-objs += spa_checkpoint.o -$(MODULE)-objs += spa_config.o -$(MODULE)-objs += spa_errlog.o -$(MODULE)-objs += spa_history.o -$(MODULE)-objs += spa_log_spacemap.o -$(MODULE)-objs += spa_misc.o -$(MODULE)-objs += spa_stats.o -$(MODULE)-objs += space_map.o -$(MODULE)-objs += space_reftree.o -$(MODULE)-objs += txg.o -$(MODULE)-objs += uberblock.o -$(MODULE)-objs += unique.o -$(MODULE)-objs += vdev.o -$(MODULE)-objs += vdev_cache.o -$(MODULE)-objs += vdev_draid.o -$(MODULE)-objs += vdev_draid_rand.o -$(MODULE)-objs += vdev_indirect.o -$(MODULE)-objs += vdev_indirect_births.o -$(MODULE)-objs += vdev_indirect_mapping.o -$(MODULE)-objs += vdev_initialize.o -$(MODULE)-objs += vdev_label.o -$(MODULE)-objs += vdev_mirror.o -$(MODULE)-objs += vdev_missing.o -$(MODULE)-objs += vdev_queue.o -$(MODULE)-objs += vdev_raidz.o -$(MODULE)-objs += vdev_raidz_math.o -$(MODULE)-objs += vdev_raidz_math_scalar.o -$(MODULE)-objs += vdev_rebuild.o -$(MODULE)-objs += vdev_removal.o -$(MODULE)-objs += vdev_root.o -$(MODULE)-objs += vdev_trim.o -$(MODULE)-objs += zap.o -$(MODULE)-objs += zap_leaf.o -$(MODULE)-objs += zap_micro.o -$(MODULE)-objs += zcp.o -$(MODULE)-objs += zcp_get.o -$(MODULE)-objs += zcp_global.o -$(MODULE)-objs += zcp_iter.o -$(MODULE)-objs += zcp_set.o -$(MODULE)-objs += zcp_synctask.o -$(MODULE)-objs += zfeature.o 
-$(MODULE)-objs += zfs_byteswap.o -$(MODULE)-objs += zfs_fm.o -$(MODULE)-objs += zfs_fuid.o -$(MODULE)-objs += zfs_ioctl.o -$(MODULE)-objs += zfs_log.o -$(MODULE)-objs += zfs_onexit.o -$(MODULE)-objs += zfs_quota.o -$(MODULE)-objs += zfs_ratelimit.o -$(MODULE)-objs += zfs_replay.o -$(MODULE)-objs += zfs_rlock.o -$(MODULE)-objs += zfs_sa.o -$(MODULE)-objs += zfs_vnops.o -$(MODULE)-objs += zil.o -$(MODULE)-objs += zio.o -$(MODULE)-objs += zio_checksum.o -$(MODULE)-objs += zio_compress.o -$(MODULE)-objs += zio_inject.o -$(MODULE)-objs += zle.o -$(MODULE)-objs += zrlock.o -$(MODULE)-objs += zthr.o -$(MODULE)-objs += zvol.o - -# Suppress incorrect warnings from versions of objtool which are not -# aware of x86 EVEX prefix instructions used for AVX512. -OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y -OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y - -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o - -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o - -$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o -$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o - -ifeq ($(CONFIG_ALTIVEC),y) -$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec -endif - -include $(mfdir)/../os/linux/zfs/Makefile diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index bf39cd613330..2c0cda25dbc6 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -109,7 +109,6 @@ void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG - ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | @@ -118,6 +117,7 @@ abd_verify(abd_t *abd) IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { + ASSERT3U(abd->abd_size, >, 0); ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else if (abd_is_gang(abd)) { uint_t child_sizes = 0; @@ -130,6 +130,7 @@ abd_verify(abd_t *abd) } ASSERT3U(abd->abd_size, ==, child_sizes); } else { + ASSERT3U(abd->abd_size, >, 0); abd_verify_scatter(abd); } #endif @@ -369,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) * will retain all the free_on_free settings after being * added to the parents list. */ +#ifdef ZFS_DEBUG + /* + * If cabd had abd_parent, we have to drop it here. We can't + * transfer it to pabd, nor we can clear abd_size leaving it. 
+ */ + if (cabd->abd_parent != NULL) { + (void) zfs_refcount_remove_many( + &cabd->abd_parent->abd_children, + cabd->abd_size, cabd); + cabd->abd_parent = NULL; + } +#endif pabd->abd_size += cabd->abd_size; + cabd->abd_size = 0; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); @@ -407,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); - ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); @@ -667,15 +680,15 @@ abd_return_buf(abd_t *abd, void *buf, size_t n) { abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } -#ifdef ZFS_DEBUG - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -#endif } void @@ -789,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { - /* If we are at the end of the gang ABD we are done */ - if (gang && !c_abd) - break; + IMPLY(abd_is_gang(abd), c_abd != NULL); abd_iter_map(&aiter); @@ -816,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; @@ -889,10 +941,10 @@ abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) &ba_ptr); } -/*ARGSUSED*/ static int abd_zero_off_cb(void *buf, size_t size, void *private) { + (void) private; (void) memset(buf, 0, size); return (0); } @@ -917,7 +969,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; - boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; if (size == 0) @@ -929,16 +980,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - dabd_is_gang_abd = abd_is_gang(dabd); - sabd_is_gang_abd = abd_is_gang(sabd); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { - /* if we are at the end of the gang ABD we are done */ - if ((dabd_is_gang_abd && !c_dabd) || - (sabd_is_gang_abd && !c_sabd)) - break; + IMPLY(abd_is_gang(dabd), c_dabd != NULL); + IMPLY(abd_is_gang(sabd), c_sabd != NULL); abd_iter_map(&daiter); abd_iter_map(&saiter); 
@@ -967,10 +1014,10 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, return (ret); } -/*ARGSUSED*/ static int abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) { + (void) private; (void) memcpy(dbuf, sbuf, size); return (0); } @@ -985,10 +1032,10 @@ abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) abd_copy_off_cb, NULL); } -/*ARGSUSED*/ static int abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) { + (void) private; return (memcmp(bufa, bufb, size)); } @@ -1012,87 +1059,63 @@ abd_cmp(abd_t *dabd, abd_t *sabd) * is the same when taking linear and when taking scatter */ void -abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, + size_t csize, size_t dsize, const unsigned parity, void (*func_raidz_gen)(void **, const void *, size_t, size_t)) { int i; - ssize_t len, dlen; + size_t len, dlen; struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; - void *caddrs[3]; + struct abd_iter daiter; + void *caddrs[3], *daddr; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; - boolean_t cabds_is_gang_abd[3]; - boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); - c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); + abd_verify(cabds[i]); + ASSERT3U(off + csize, <=, cabds[i]->abd_size); + c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off); } - if (dabd) { - dabd_is_gang_abd = abd_is_gang(dabd); - c_dabd = abd_init_abd_iter(dabd, &daiter, 0); + if (dsize > 0) { + ASSERT(dabd); + abd_verify(dabd); + ASSERT3U(off + dsize, <=, dabd->abd_size); + c_dabd = abd_init_abd_iter(dabd, &daiter, off); } - ASSERT3S(dsize, >=, 0); - abd_enter_critical(flags); while (csize > 0) { - /* if we are at the end of the gang ABD we are done */ - if (dabd_is_gang_abd && !c_dabd) - break; - + len = csize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we are - * done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; + len = MIN(caiters[i].iter_mapsize, len); } - len = csize; - - if (dabd && dsize > 0) + if (dsize > 0) { + IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - fallthrough; - case 2: - len = MIN(caiters[1].iter_mapsize, len); - fallthrough; - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ + daddr = daiter.iter_mapaddr; len = MIN(daiter.iter_mapsize, len); dlen = len; - } else + } else { + daddr = NULL; dlen = 0; + } /* must be progressive */ - ASSERT3S(len, >, 0); + ASSERT3U(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). 
*/ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + func_raidz_gen(caddrs, daddr, len, dlen); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); @@ -1101,7 +1124,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, &caiters[i], len); } - if (dabd && dsize > 0) { + if (dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, @@ -1110,9 +1133,6 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, } csize -= len; - - ASSERT3S(dsize, >=, 0); - ASSERT3S(csize, >=, 0); } abd_exit_critical(flags); } @@ -1129,27 +1149,27 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, */ void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, + size_t tsize, const unsigned parity, void (*func_raidz_rec)(void **t, const size_t tsize, void **c, const unsigned *mul), const unsigned *mul) { int i; - ssize_t len; + size_t len; struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; - boolean_t cabds_is_gang_abd[3]; - boolean_t tabds_is_gang_abd[3]; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); - tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + abd_verify(cabds[i]); + abd_verify(tabds[i]); + ASSERT3U(tsize, <=, cabds[i]->abd_size); + ASSERT3U(tsize, <=, tabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = @@ -1158,36 +1178,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_enter_critical(flags); while (tsize > 0) { - + len = tsize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we - * are done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; - if (tabds_is_gang_abd[i] && !c_tabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); + IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL); abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; + len = MIN(citers[i].iter_mapsize, len); + len = MIN(xiters[i].iter_mapsize, len); } - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - fallthrough; - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - fallthrough; - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } /* must be progressive */ ASSERT3S(len, >, 0); /* diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c index c4ea4f86fc5f..488c6ef3b6fc 100644 --- a/sys/contrib/openzfs/module/zfs/aggsum.c +++ b/sys/contrib/openzfs/module/zfs/aggsum.c @@ -87,7 +87,7 @@ static uint_t aggsum_borrow_shift = 4; void aggsum_init(aggsum_t *as, uint64_t value) { - bzero(as, sizeof (*as)); + memset(as, 0, sizeof (*as)); as->as_lower_bound = as->as_upper_bound = value; mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); /* diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 79e2d4381830..30d30b98a6c6 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
+ * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -108,12 +108,11 @@ * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the - * arc_meta_limit is reached and no buffers can be safely evicted. In + * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so - * they can be reclaimed and the arc_meta_limit honored. For example, - * when using the ZPL each dentry holds a references on a znode. These - * dentries must be pruned before the arc buffer holding the znode can - * be safely evicted. + * they can be reclaimed. For example, when using the ZPL each dentry + * holds a references on a znode. These dentries must be pruned before + * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. @@ -250,7 +249,7 @@ * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called - * with the transformed data and will bcopy the transformed on-disk block into + * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there @@ -328,9 +327,12 @@ static zthr_t *arc_reap_zthr; * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; +static arc_buf_hdr_t **arc_state_evict_markers; +static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; +static clock_t arc_last_uncached_flush; /* * Count of bytes evicted since boot. @@ -352,7 +354,7 @@ static list_t arc_evict_waiters; * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ -int zfs_arc_eviction_pct = 200; +static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before @@ -361,24 +363,21 @@ int zfs_arc_eviction_pct = 200; * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ -int zfs_arc_evict_batch_limit = 10; +static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ -int arc_grow_retry = 5; +uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ -int arc_kmem_cache_reap_retry_ms = 1000; +static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; - -/* shift of arc_c for calculating both min and max arc_p */ -int arc_p_min_shift = 4; +static int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ -int arc_shrink_shift = 7; +uint_t arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL @@ -394,20 +393,20 @@ uint_t zfs_arc_pc_percent = 0; * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. 
*/ -int arc_no_grow_shift = 5; +uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_ms; -static int arc_min_prescient_prefetch_ms; +static uint_t arc_min_prefetch_ms; +static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ -int arc_lotsfree_percent = 10; +uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. @@ -417,23 +416,23 @@ boolean_t arc_warm; /* * These tunables are for performance analysis. */ -unsigned long zfs_arc_max = 0; -unsigned long zfs_arc_min = 0; -unsigned long zfs_arc_meta_limit = 0; -unsigned long zfs_arc_meta_min = 0; -unsigned long zfs_arc_dnode_limit = 0; -unsigned long zfs_arc_dnode_reduce_percent = 10; -int zfs_arc_grow_retry = 0; -int zfs_arc_shrink_shift = 0; -int zfs_arc_p_min_shift = 0; -int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +uint64_t zfs_arc_max = 0; +uint64_t zfs_arc_min = 0; +static uint64_t zfs_arc_dnode_limit = 0; +static uint_t zfs_arc_dnode_reduce_percent = 10; +static uint_t zfs_arc_grow_retry = 0; +static uint_t zfs_arc_shrink_shift = 0; +uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* - * ARC dirty data constraints for arc_tempreserve_space() throttle. + * ARC dirty data constraints for arc_tempreserve_space() throttle: + * * total dirty data limit + * * anon block dirty limit + * * each pool's anon allowance */ -unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ -unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ -unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ +static const unsigned long zfs_arc_dirty_limit_percent = 50; +static const unsigned long zfs_arc_anon_limit_percent = 25; +static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. @@ -441,51 +440,60 @@ unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ int zfs_compressed_arc_enabled = B_TRUE; /* - * ARC will evict meta buffers that exceed arc_meta_limit. This - * tunable make arc_meta_limit adjustable for different workloads. + * Balance between metadata and data on ghost hits. Values above 100 + * increase metadata caching by proportionally reducing effect of ghost + * data hits on target data/metadata rate. */ -unsigned long zfs_arc_meta_limit_percent = 75; +static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. 
*/ -unsigned long zfs_arc_dnode_limit_percent = 10; +static uint_t zfs_arc_dnode_limit_percent = 10; + +/* + * These tunables are Linux-specific + */ +static uint64_t zfs_arc_sys_free = 0; +static uint_t zfs_arc_min_prefetch_ms = 0; +static uint_t zfs_arc_min_prescient_prefetch_ms = 0; +static uint_t zfs_arc_lotsfree_percent = 10; /* - * These tunables are Linux specific + * Number of arc_prune threads */ -unsigned long zfs_arc_sys_free = 0; -int zfs_arc_min_prefetch_ms = 0; -int zfs_arc_min_prescient_prefetch_ms = 0; -int zfs_arc_p_dampener_disable = 1; -int zfs_arc_meta_prune = 10000; -int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; -int zfs_arc_meta_adjust_restarts = 4096; -int zfs_arc_lotsfree_percent = 10; +static int zfs_arc_prune_task_threads = 1; -/* The 6 states: */ +/* The 7 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; +arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "uncached_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, @@ -502,7 +510,9 @@ arc_stats_t arc_stats = { { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, - { "p", KSTAT_DATA_UINT64 }, + { "meta", KSTAT_DATA_UINT64 }, + { "pd", KSTAT_DATA_UINT64 }, + { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, @@ -520,20 +530,35 @@ arc_stats_t arc_stats = { { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_data", KSTAT_DATA_UINT64 }, + { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_data", KSTAT_DATA_UINT64 }, + { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_data", KSTAT_DATA_UINT64 }, + { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", 
KSTAT_DATA_UINT64 }, + { "uncached_size", KSTAT_DATA_UINT64 }, + { "uncached_data", KSTAT_DATA_UINT64 }, + { "uncached_metadata", KSTAT_DATA_UINT64 }, + { "uncached_evictable_data", KSTAT_DATA_UINT64 }, + { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, @@ -586,13 +611,14 @@ arc_stats_t arc_stats = { { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, - { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, - { "arc_meta_max", KSTAT_DATA_UINT64 }, - { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, + { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, + { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, @@ -646,7 +672,7 @@ arc_sums_t arc_sums; ARCSTAT(stat) = x; \ } while (0) -kstat_t *arc_ksp; +static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- @@ -658,10 +684,7 @@ kstat_t *arc_ksp; */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -/* max size for dnodes */ -#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; @@ -683,6 +706,7 @@ taskq_t *arc_prune_taskq; ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) @@ -724,8 +748,7 @@ taskq_t *arc_prune_taskq; * Other sizes */ -#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* @@ -753,8 +776,8 @@ uint64_t zfs_crc64_table[256]; * Level 2 ARC */ -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 8 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost @@ -771,16 +794,16 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ 
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ -int l2arc_meta_percent = 33; /* limit on headers size */ +static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals @@ -833,21 +856,24 @@ static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, - ARC_HDR_DO_ADAPT = 0x2, ARC_HDR_USE_RESERVE = 0x4, + ARC_HDR_ALLOC_LINEAR = 0x8, }; -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, int); -static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, int); -static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); -static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); +static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, + const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); -static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static void arc_hdr_destroy(arc_buf_hdr_t *); +static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); +static void arc_change_state(arc_state_t *, arc_buf_hdr_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -860,6 +886,8 @@ static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); +static void arc_prune_async(uint64_t adjust); + #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ @@ -881,7 +909,7 @@ int l2arc_exclude_special = 0; * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ -int l2arc_mfuonly = 0; +static int l2arc_mfuonly = 0; /* * L2ARC TRIM @@ -898,7 +926,7 @@ int l2arc_mfuonly = 0; * will vary depending of how well the specific device handles * these commands. */ -unsigned long l2arc_trim_ahead = 0; +static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: @@ -913,12 +941,12 @@ unsigned long l2arc_trim_ahead = 0; * data. In this case do not write log blocks in L2ARC in order * not to waste space. 
*/ -int l2arc_rebuild_enabled = B_TRUE; -unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; +static int l2arc_rebuild_enabled = B_TRUE; +static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); -static void l2arc_dev_rebuild_thread(void *arg); +static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ @@ -938,7 +966,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ @@ -986,7 +1014,7 @@ static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_PHYSICAL_BIRTH(bp); + uint64_t birth = BP_GET_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; @@ -1086,15 +1114,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) */ static kmem_cache_t *hdr_full_cache; -static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { - int i; - #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages @@ -1106,10 +1131,9 @@ buf_fini(void) kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif - for (i = 0; i < BUF_LOCKS; i++) + for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); - kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1118,58 +1142,44 @@ buf_fini(void) * Constructor callback - called when the cache is empty * and a new buf is requested. 
*/ -/* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, HDR_FULL_SIZE); + memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); +#ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&hdr->b_l1hdr.b_arc_node); - list_link_init(&hdr->b_l2hdr.b_l2node); +#endif multilist_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } -/* ARGSUSED */ -static int -hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) -{ - arc_buf_hdr_t *hdr = vbuf; - - hdr_full_cons(vbuf, unused, kmflag); - bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); - arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); - - return (0); -} - -/* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, HDR_L2ONLY_SIZE); + memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } -/* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; - bzero(buf, sizeof (arc_buf_t)); - mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + memset(buf, 0, sizeof (arc_buf_t)); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -1179,47 +1189,37 @@ buf_cons(void *vbuf, void *unused, int kmflag) * Destructor callback - called when a cached buf is * no longer required. */ -/* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { + (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); - cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); +#ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); +#endif ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } -/* ARGSUSED */ -static void -hdr_full_crypt_dest(void *vbuf, void *unused) -{ - arc_buf_hdr_t *hdr = vbuf; - - hdr_full_dest(vbuf, unused); - arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); -} - -/* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { - arc_buf_hdr_t *hdr __maybe_unused = vbuf; + (void) unused; + arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } -/* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { - arc_buf_t *buf = vbuf; + (void) unused; + (void) vbuf; - mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } @@ -1259,9 +1259,6 @@ retry: hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); - hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", - HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, - NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); @@ -1324,9 +1321,9 @@ arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, ASSERT(HDR_PROTECTED(hdr)); - bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); - bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); - bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); + 
memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } @@ -1369,7 +1366,7 @@ arc_buf_is_shared(arc_buf_t *buf) abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); - IMPLY(shared, ARC_BUF_SHARED(buf)); + EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* @@ -1387,6 +1384,7 @@ arc_buf_is_shared(arc_buf_t *buf) static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { +#ifdef ZFS_DEBUG ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); @@ -1395,6 +1393,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr) hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#endif } /* @@ -1423,6 +1422,7 @@ arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) static void arc_cksum_verify(arc_buf_t *buf) { +#ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; @@ -1445,6 +1445,7 @@ arc_cksum_verify(arc_buf_t *buf) if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#endif } /* @@ -1485,14 +1486,13 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) static void arc_cksum_compute(arc_buf_t *buf) { - arc_buf_hdr_t *hdr = buf->b_hdr; - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; +#ifdef ZFS_DEBUG + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); - - mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; @@ -1505,6 +1505,7 @@ arc_cksum_compute(arc_buf_t *buf) fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#endif arc_buf_watch(buf); } @@ -1512,11 +1513,11 @@ arc_cksum_compute(arc_buf_t *buf) void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { + (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif -/* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { @@ -1525,10 +1526,11 @@ arc_buf_unwatch(arc_buf_t *buf) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } +#else + (void) buf; #endif } -/* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { @@ -1536,6 +1538,8 @@ arc_buf_watch(arc_buf_t *buf) if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); +#else + (void) buf; #endif } @@ -1681,18 +1685,20 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) } if (!ARC_BUF_COMPRESSED(from)) { - bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); + memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } +#ifdef ZFS_DEBUG /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. 
*/ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); +#endif return (copied); } @@ -1778,12 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - tmpbuf = zio_buf_alloc(lsize); - abd = abd_get_from_buf(tmpbuf, lsize); - abd_take_ownership_of_buf(abd, B_TRUE); + csize = zio_compress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); + hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); + ASSERT3P(tmpbuf, !=, NULL); ASSERT3U(csize, <=, psize); + abd = abd_get_from_buf(tmpbuf, lsize); + abd_take_ownership_of_buf(abd, B_TRUE); abd_zero_off(abd, csize, psize - csize); } @@ -1836,7 +1843,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, @@ -1863,8 +1870,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -1947,20 +1953,19 @@ error: * arc_buf_fill(). */ static void -arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) +arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; - hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* @@ -1995,7 +2000,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); - IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from @@ -2051,7 +2056,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, if (hash_lock != NULL) mutex_enter(hash_lock); - arc_buf_untransform_in_place(buf, hash_lock); + arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); @@ -2063,21 +2068,23 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, } if (hdr_compressed == compressed) { - if (!arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { + ASSERT(arc_buf_is_shared(buf)); + } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. 
*/ - if (arc_buf_is_shared(buf)) { - ASSERT(ARC_BUF_COMPRESSED(buf)); + if (ARC_BUF_SHARED(buf)) { + ASSERTF(ARC_BUF_COMPRESSED(buf), + "buf %p was uncompressed", buf); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; @@ -2088,6 +2095,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT(!arc_buf_is_shared(buf)); + /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); @@ -2175,7 +2184,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); - spa_log_error(spa, zb); + spa_log_error(spa, zb, buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } @@ -2196,7 +2205,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2216,7 +2224,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2236,7 +2244,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2256,7 +2263,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); @@ -2270,33 +2277,22 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) * it is not evictable. */ static void -add_reference(arc_buf_hdr_t *hdr, void *tag) +add_reference(arc_buf_hdr_t *hdr, const void *tag) { - arc_state_t *state; + arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } - state = hdr->b_l1hdr.b_state; - if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && - (state != arc_anon)) { + state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ - if (state != arc_l2c_only) { - multilist_remove(&state->arcs_list[arc_buf_type(hdr)], - hdr); - arc_evictable_space_decrement(hdr, state); - } - /* remove the prefetch flag if we get a reference */ - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); + arc_evictable_space_decrement(hdr, state); } } @@ -2306,26 +2302,30 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) * list making it eligible for eviction. 
*/ static int -remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); - ASSERT(!GHOST_STATE(state)); + ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */ - /* - * arc_l2c_only counts as a ghost state so we don't need to explicitly - * check to prevent usage of the arc_l2c_only list. - */ - if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && - (state != arc_anon)) { - multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - arc_evictable_space_increment(hdr, state); + if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0) + return (cnt); + + if (state == arc_anon) { + arc_hdr_destroy(hdr); + return (0); + } + if (state == arc_uncached && !HDR_PREFETCH(hdr)) { + arc_change_state(arc_anon, hdr); + arc_hdr_destroy(hdr); + return (0); } - return (cnt); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + arc_evictable_space_increment(hdr, state); + return (0); } /* @@ -2338,6 +2338,7 @@ remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { + (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; @@ -2358,7 +2359,9 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) l2hdr = &hdr->b_l2hdr; if (l1hdr) { - abi->abi_bufcnt = l1hdr->b_bufcnt; + abi->abi_bufcnt = 0; + for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) + abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; @@ -2382,14 +2385,12 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) * for the buffer must be held by the caller. 
*/ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, - kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; - uint32_t bufcnt; boolean_t update_old, update_new; - arc_buf_contents_t buftype = arc_buf_type(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() @@ -2401,21 +2402,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); - bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); + update_old = (hdr->b_l1hdr.b_buf != NULL || + hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + + IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); + IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || + ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; - bufcnt = 0; update_old = B_FALSE; } update_new = update_old; + if (GHOST_STATE(old_state)) + update_old = B_TRUE; + if (GHOST_STATE(new_state)) + update_new = B_TRUE; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); - ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); - ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the @@ -2424,14 +2430,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_remove(&old_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_old = B_TRUE; + /* remove_reference() saves on insert. */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + multilist_remove(&old_state->arcs_list[type], + hdr); + arc_evictable_space_decrement(hdr, old_state); } - arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* @@ -2441,13 +2445,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); - multilist_insert(&new_state->arcs_list[buftype], hdr); - - if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - update_new = B_TRUE; - } + multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } @@ -2461,21 +2459,19 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { - ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first - * remove all arc buffers. Thus, we'll have a - * bufcnt of zero, and no arc buffer to use for - * the reference. As a result, we use the arc - * header pointer for the reference. + * remove all arc buffers. Thus, we'll have no arc + * buffer to use for the reference. As a result, we + * use the arc header pointer for the reference. 
*/ - (void) zfs_refcount_add_many(&new_state->arcs_size, + (void) zfs_refcount_add_many( + &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2484,8 +2480,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2494,24 +2488,23 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. */ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( - &new_state->arcs_size, + &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } @@ -2520,7 +2513,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { - ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); @@ -2532,10 +2524,10 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * header on the ghost state. */ - (void) zfs_refcount_remove_many(&old_state->arcs_size, + (void) zfs_refcount_remove_many( + &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { - uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, @@ -2544,8 +2536,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - ASSERT3U(bufcnt, !=, 0); - buffers++; /* * When the arc_buf_t is sharing the data @@ -2554,27 +2544,26 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, * add to the refcount if the arc_buf_t is * not shared. 
*/ - if (arc_buf_is_shared(buf)) + if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_buf_size(buf), - buf); + &old_state->arcs_size[type], + arc_buf_size(buf), buf); } - ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), - hdr); + &old_state->arcs_size[type], + arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( - &old_state->arcs_size, HDR_GET_PSIZE(hdr), - hdr); + &old_state->arcs_size[type], + HDR_GET_PSIZE(hdr), hdr); } } } @@ -2608,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, space); + ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2631,7 +2620,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) - aggsum_add(&arc_sums.arcstat_meta_used, space); + ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } @@ -2654,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - aggsum_add(&arc_sums.arcstat_dnode_size, -space); + ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -2670,13 +2659,8 @@ arc_space_return(uint64_t space, arc_space_type_t type) break; } - if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { - ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, - space) >= 0); - ARCSTAT_MAX(arcstat_meta_max, - aggsum_upper_bound(&arc_sums.arcstat_meta_used)); - aggsum_add(&arc_sums.arcstat_meta_used, -space); - } + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) + ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); @@ -2729,8 +2713,8 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, - void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, - boolean_t fill, arc_buf_t **ret) + const void *tag, boolean_t encrypted, boolean_t compressed, + boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; @@ -2814,9 +2798,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; - hdr->b_l1hdr.b_bufcnt += 1; - if (encrypted) - hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or @@ -2830,7 +2811,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, return (0); } -static char *arc_onloan_tag = "onloan"; +static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) @@ -2889,7 +2870,7 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, * Return a loaned arc buffer to the arc. 
*/ void -arc_return_buf(arc_buf_t *buf, void *tag) +arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -2903,7 +2884,7 @@ arc_return_buf(arc_buf_t *buf, void *tag) /* Detach an arc_buf from a dbuf (tag) */ void -arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -2943,7 +2924,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { @@ -2976,7 +2957,8 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, @@ -3005,7 +2987,8 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ - zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + zfs_refcount_transfer_ownership_many( + &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); @@ -3056,8 +3039,6 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); - IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); @@ -3087,31 +3068,30 @@ arc_buf_destroy_impl(arc_buf_t *buf) arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { + ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); - hdr->b_l1hdr.b_bufcnt -= 1; - - if (ARC_BUF_ENCRYPTED(buf)) { - hdr->b_crypt_hdr.b_ebufcnt -= 1; - - /* - * If we have no more encrypted buffers and we've - * already gotten a copy of the decrypted data we can - * free b_rabd to save some space. - */ - if (hdr->b_crypt_hdr.b_ebufcnt == 0 && - HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && - !HDR_IO_IN_PROGRESS(hdr)) { - arc_hdr_free_abd(hdr, B_TRUE); + /* + * If we have no more encrypted buffers and we've already + * gotten a copy of the decrypted data we can free b_rabd + * to save some space. 
+ */ + if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && + hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { + arc_buf_t *b; + for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { + if (b != buf && ARC_BUF_ENCRYPTED(b)) + break; } + if (b == NULL) + arc_hdr_free_abd(hdr, B_TRUE); } } @@ -3132,9 +3112,9 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ - VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); + ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); @@ -3272,14 +3252,12 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - if (protected) { - hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); - } else { - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); - } + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); +#ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; @@ -3297,7 +3275,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -3323,24 +3300,14 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); - /* - * if the caller wanted a new full header and the header is to be - * encrypted we will actually allocate the header from the full crypt - * cache instead. The same applies to freeing from the old cache. - */ - if (HDR_PROTECTED(hdr) && new == hdr_full_cache) - new = hdr_full_crypt_cache; - if (HDR_PROTECTED(hdr) && old == hdr_full_cache) - old = hdr_full_crypt_cache; - nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); - bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); - if (new == hdr_full_cache || new == hdr_full_crypt_cache) { + if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a @@ -3354,8 +3321,9 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(hdr->b_l1hdr.b_bufcnt); +#ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif /* * If we've reached here, We must have been called from @@ -3419,125 +3387,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) } /* - * This function allows an L1 header to be reallocated as a crypt - * header and vice versa. If we are going to a crypt header, the - * new fields will be zeroed out. - */ -static arc_buf_hdr_t * -arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) -{ - arc_buf_hdr_t *nhdr; - arc_buf_t *buf; - kmem_cache_t *ncache, *ocache; - - /* - * This function requires that hdr is in the arc_anon state. - * Therefore it won't have any L2ARC data for us to worry - * about copying. 
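Illustration (not from the patch; names are hypothetical): the hunk above drops the b_crypt_hdr.b_ebufcnt counter and instead walks the header's buffer list to decide whether any other encrypted buffer is still attached before freeing the raw copy. A minimal standalone sketch of that counter-free check:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for arc_buf_t and ARC_BUF_ENCRYPTED(). */
typedef struct buf {
        struct buf *next;
        bool encrypted;
} buf_t;

/*
 * True if no encrypted buffer other than 'dying' remains on the list,
 * i.e. the raw (encrypted) copy of the data may now be dropped.
 */
static bool
no_other_encrypted(const buf_t *head, const buf_t *dying)
{
        for (const buf_t *b = head; b != NULL; b = b->next) {
                if (b != dying && b->encrypted)
                        return (false);
        }
        return (true);
}

int
main(void)
{
        buf_t plain = { NULL, false };
        buf_t crypt = { &plain, true };         /* list: crypt -> plain */

        printf("%d\n", no_other_encrypted(&crypt, &crypt));    /* 1 */
        printf("%d\n", no_other_encrypted(&crypt, &plain));     /* 0 */
        return (0);
}

Presumably the buffer list is short enough that a scan like this is cheaper to keep correct than a counter every realloc path had to copy and zero.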
- */ - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); - ASSERT3P(hdr->b_hash_next, ==, NULL); - - if (need_crypt) { - ncache = hdr_full_crypt_cache; - ocache = hdr_full_cache; - } else { - ncache = hdr_full_cache; - ocache = hdr_full_crypt_cache; - } - - nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); - - /* - * Copy all members that aren't locks or condvars to the new header. - * No lists are pointing to us (as we asserted above), so we don't - * need to worry about the list nodes. - */ - nhdr->b_dva = hdr->b_dva; - nhdr->b_birth = hdr->b_birth; - nhdr->b_type = hdr->b_type; - nhdr->b_flags = hdr->b_flags; - nhdr->b_psize = hdr->b_psize; - nhdr->b_lsize = hdr->b_lsize; - nhdr->b_spa = hdr->b_spa; - nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; - nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; - nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; - nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; - nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; - nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; - nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; - nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; - nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; - nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; - nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; - - /* - * This zfs_refcount_add() exists only to ensure that the individual - * arc buffers always point to a header that is referenced, avoiding - * a small race condition that could trigger ASSERTs. - */ - (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); - nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; - for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { - mutex_enter(&buf->b_evict_lock); - buf->b_hdr = nhdr; - mutex_exit(&buf->b_evict_lock); - } - - zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); - (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - - if (need_crypt) { - arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); - } else { - arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); - } - - /* unset all members of the original hdr */ - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_type = ARC_BUFC_INVALID; - hdr->b_flags = 0; - hdr->b_psize = 0; - hdr->b_lsize = 0; - hdr->b_spa = 0; - hdr->b_l1hdr.b_freeze_cksum = NULL; - hdr->b_l1hdr.b_buf = NULL; - hdr->b_l1hdr.b_bufcnt = 0; - hdr->b_l1hdr.b_byteswap = 0; - hdr->b_l1hdr.b_state = NULL; - hdr->b_l1hdr.b_arc_access = 0; - hdr->b_l1hdr.b_mru_hits = 0; - hdr->b_l1hdr.b_mru_ghost_hits = 0; - hdr->b_l1hdr.b_mfu_hits = 0; - hdr->b_l1hdr.b_mfu_ghost_hits = 0; - hdr->b_l1hdr.b_acb = NULL; - hdr->b_l1hdr.b_pabd = NULL; - - if (ocache == hdr_full_crypt_cache) { - ASSERT(!HDR_HAS_RABD(hdr)); - hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; - hdr->b_crypt_hdr.b_ebufcnt = 0; - hdr->b_crypt_hdr.b_dsobj = 0; - bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); - bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); - bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); - } - - buf_discard_identity(hdr); - kmem_cache_free(ocache, hdr); - - return (nhdr); -} - -/* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. 
It * is also used to allow the root objset block to be updated without altering @@ -3556,8 +3405,7 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); - if (!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? @@ -3566,11 +3414,11 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, arc_cksum_free(hdr); if (salt != NULL) - bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) - bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) - bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* @@ -3578,7 +3426,8 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * -arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) +arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, + int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); @@ -3596,8 +3445,8 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) * for bufs containing metadata. */ arc_buf_t * -arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, - enum zio_compress compression_type, uint8_t complevel) +arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, + uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); @@ -3611,7 +3460,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * To ensure that the hdr has the correct data in it if we call @@ -3624,9 +3472,9 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, } arc_buf_t * -arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, - const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, - dmu_object_type_t ot, uint64_t psize, uint64_t lsize, +arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, + boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; @@ -3646,9 +3494,9 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? 
DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); - bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); - bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); - bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); + memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); + memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an @@ -3659,7 +3507,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); return (buf); } @@ -3759,8 +3606,6 @@ static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_buf == NULL || - hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } @@ -3821,27 +3666,25 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - - if (!HDR_PROTECTED(hdr)) { - kmem_cache_free(hdr_full_cache, hdr); - } else { - kmem_cache_free(hdr_full_crypt_cache, hdr); - } +#ifdef ZFS_DEBUG + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif + kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void -arc_buf_destroy(arc_buf_t *buf, void* tag) +arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - VERIFY0(remove_reference(hdr, NULL, tag)); - arc_hdr_destroy(hdr); + VERIFY0(remove_reference(hdr, tag)); return; } @@ -3849,13 +3692,13 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); - (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); + (void) remove_reference(hdr, tag); mutex_exit(hash_lock); } @@ -3870,6 +3713,7 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted + * - arc_uncached -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction @@ -3881,21 +3725,22 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) * only the evicted headers size. */ static int64_t -arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) +arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; - int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? + uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 
arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion @@ -3921,49 +3766,34 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ - arc_change_state(arc_l2c_only, hdr, hash_lock); + arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, + (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } - ASSERT(state == arc_mru || state == arc_mfu); - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached); + evicted_state = (state == arc_uncached) ? arc_anon : + ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost); /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(hdr) || - ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - MSEC_TO_TICK(min_lifetime))) { + MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); - while (hdr->b_l1hdr.b_buf) { - arc_buf_t *buf = hdr->b_l1hdr.b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - ARCSTAT_BUMP(arcstat_mutex_miss); - break; - } - if (buf->b_data != NULL) { - bytes_evicted += HDR_GET_LSIZE(hdr); - *real_evicted += HDR_GET_LSIZE(hdr); - } - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy_impl(buf); - } - if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { @@ -3991,28 +3821,27 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) } } - if (hdr->b_l1hdr.b_bufcnt == 0) { - arc_cksum_free(hdr); - - bytes_evicted += arc_hdr_size(hdr); - *real_evicted += arc_hdr_size(hdr); + bytes_evicted += arc_hdr_size(hdr); + *real_evicted += arc_hdr_size(hdr); - /* - * If this hdr is being evicted and has a compressed - * buffer then we discard it here before we change states. - * This ensures that the accounting is updated correctly - * in arc_free_data_impl(). - */ - if (hdr->b_l1hdr.b_pabd != NULL) - arc_hdr_free_abd(hdr, B_FALSE); + /* + * If this hdr is being evicted and has a compressed buffer then we + * discard it here before we change states. This ensures that the + * accounting is updated correctly in arc_free_data_impl(). 
+ */ + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_abd(hdr, B_FALSE); - if (HDR_HAS_RABD(hdr)) - arc_hdr_free_abd(hdr, B_TRUE); + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); - arc_change_state(evicted_state, hdr, hash_lock); + arc_change_state(evicted_state, hdr); + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + if (evicted_state == arc_anon) { + arc_hdr_destroy(hdr); + *real_evicted += HDR_FULL_SIZE; + } else { ASSERT(HDR_IN_HASH_TABLE(hdr)); - arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); @@ -4040,15 +3869,15 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - int evict_count = zfs_arc_evict_batch_limit; + uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); - mls = multilist_sublist_lock(ml, idx); + mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { - if ((evict_count <= 0) || (bytes_evicted >= bytes)) + if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* @@ -4097,8 +3926,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, if (mutex_tryenter(hash_lock)) { uint64_t revicted; - uint64_t evicted = arc_evict_hdr(hdr, hash_lock, - &revicted); + uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; @@ -4152,11 +3980,54 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * this CPU are able to make progress, make a voluntary preemption * call here. */ - cond_resched(); + kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } +static arc_buf_hdr_t * +arc_state_alloc_marker(void) +{ + arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_state_impl(). + */ + marker->b_spa = 0; + + return (marker); +} + +static void +arc_state_free_marker(arc_buf_hdr_t *marker) +{ + kmem_cache_free(hdr_full_cache, marker); +} + +/* + * Allocate an array of buffer headers used as placeholders during arc state + * eviction. + */ +static arc_buf_hdr_t ** +arc_state_alloc_markers(int count) +{ + arc_buf_hdr_t **markers; + + markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); + for (int i = 0; i < count; i++) + markers[i] = arc_state_alloc_marker(); + return (markers); +} + +static void +arc_state_free_markers(arc_buf_hdr_t **markers, int count) +{ + for (int i = 0; i < count; i++) + arc_state_free_marker(markers[i]); + kmem_free(markers, sizeof (*markers) * count); +} + /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the @@ -4171,8 +4042,8 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * the given arc state; which is used by arc_flush(). */ static uint64_t -arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, - arc_buf_contents_t type) +arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, + uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; @@ -4188,20 +4059,16 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, * pick up where we left off for each individual sublist, rather * than starting from the tail each time. 
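Illustration (not from the patch; names are hypothetical): arc_state_alloc_marker() above allocates placeholder headers whose b_spa of 0 tells the eviction code to skip them, so a later pass can resume from where a marker was left in a sublist. A rough userland analogue of that traversal convention:

#include <stdio.h>
#include <stddef.h>

/*
 * Hypothetical stand-in for a header on a sublist: spa == 0 marks a
 * placeholder, the same convention arc_state_alloc_marker() uses.
 */
typedef struct node {
        unsigned long long spa;
        struct node *prev;              /* toward the list head */
} node_t;

/* Step past any markers to the previous real header, if one exists. */
static node_t *
prev_real(node_t *from)
{
        node_t *n = from->prev;
        while (n != NULL && n->spa == 0)
                n = n->prev;
        return (n);
}

int
main(void)
{
        node_t a = { 17ULL, NULL };     /* real header for pool 17 */
        node_t m = { 0ULL, &a };        /* someone else's marker */
        node_t tail = { 0ULL, &m };     /* our marker at the tail */

        node_t *n = prev_real(&tail);
        printf("%llu\n", n ? n->spa : 0ULL);    /* prints 17 */
        return (0);
}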
*/ - markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + if (zthr_iscurthread(arc_evict_zthr)) { + markers = arc_state_evict_markers; + ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); + } else { + markers = arc_state_alloc_markers(num_sublists); + } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; - markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - - /* - * A b_spa of 0 is used to indicate that this header is - * a marker. This fact is used in arc_evict_type() and - * arc_evict_state_impl(). - */ - markers[i]->b_spa = 0; - - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } @@ -4215,19 +4082,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, uint64_t scan_evicted = 0; /* - * Try to reduce pinned dnodes with a floor of arc_dnode_limit. - * Request that 10% of the LRUs be scanned by the superblock - * shrinker. - */ - if (type == ARC_BUFC_DATA && aggsum_compare( - &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { - arc_prune_async((aggsum_upper_bound( - &arc_sums.arcstat_dnode_size) - - arc_dnode_size_limit) / sizeof (dnode_t) / - zfs_arc_dnode_reduce_percent); - } - - /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist @@ -4279,13 +4133,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, } for (int i = 0; i < num_sublists; i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); - - kmem_cache_free(hdr_full_cache, markers[i]); } - kmem_free(markers, sizeof (*markers) * num_sublists); + if (markers != arc_state_evict_markers) + arc_state_free_markers(markers, num_sublists); return (total_evicted); } @@ -4312,7 +4165,7 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { - evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; @@ -4322,252 +4175,64 @@ arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, } /* - * Evict the specified number of bytes from the state specified, - * restricting eviction to the spa and type given. This function - * prevents us from trying to evict more from a state's list than - * is "evictable", and to skip evicting altogether when passed a + * Evict the specified number of bytes from the state specified. This + * function prevents us from trying to evict more from a state's list + * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". 
*/ static uint64_t -arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) +arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); - return (arc_evict_state(state, spa, delta, type)); + return (arc_evict_state(state, type, 0, delta)); } return (0); } /* - * The goal of this function is to evict enough meta data buffers from the - * ARC in order to enforce the arc_meta_limit. Achieving this is slightly - * more complicated than it appears because it is common for data buffers - * to have holds on meta data buffers. In addition, dnode meta data buffers - * will be held by the dnodes in the block preventing them from being freed. - * This means we can't simply traverse the ARC and expect to always find - * enough unheld meta data buffer to release. - * - * Therefore, this function has been updated to make alternating passes - * over the ARC releasing data buffers and then newly unheld meta data - * buffers. This ensures forward progress is maintained and meta_used - * will decrease. Normally this is sufficient, but if required the ARC - * will call the registered prune callbacks causing dentry and inodes to - * be dropped from the VFS cache. This will make dnode meta data buffers - * available for reclaim. + * Adjust specified fraction, taking into account initial ghost state(s) size, + * ghost hit bytes towards increasing the fraction, ghost hit bytes towards + * decreasing it, plus a balance factor, controlling the decrease rate, used + * to balance metadata vs data. */ static uint64_t -arc_evict_meta_balanced(uint64_t meta_used) +arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, + uint_t balance) { - int64_t delta, prune = 0, adjustmnt; - uint64_t total_evicted = 0; - arc_buf_contents_t type = ARC_BUFC_DATA; - int restarts = MAX(zfs_arc_meta_adjust_restarts, 0); + if (total < 8 || up + down == 0) + return (frac); -restart: /* - * This slightly differs than the way we evict from the mru in - * arc_evict because we don't have a "target" value (i.e. no - * "meta" arc_p). As a result, I think we can completely - * cannibalize the metadata in the MRU before we evict the - * metadata from the MFU. I think we probably need to implement a - * "metadata arc_p" value to do this properly. + * We should not have more ghost hits than ghost size, but they + * may get close. Restrict maximum adjustment in that case. */ - adjustmnt = meta_used - arc_meta_limit; - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mru, 0, delta, type); - adjustmnt -= delta; + if (up + down >= total / 4) { + uint64_t scale = (up + down) / (total / 8); + up /= scale; + down /= scale; } - /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. - */ + /* Get maximal dynamic range by choosing optimal shifts. 
*/ + int s = highbit64(total); + s = MIN(64 - s, 32); - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { - delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), - adjustmnt); - total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); - } + uint64_t ofrac = (1ULL << 32) - frac; - adjustmnt = meta_used - arc_meta_limit; + if (frac >= 4 * ofrac) + up /= frac / (2 * ofrac + 1); + up = (up << s) / (total >> (32 - s)); + if (ofrac >= 4 * frac) + down /= ofrac / (2 * frac + 1); + down = (down << s) / (total >> (32 - s)); + down = down * 100 / balance; - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); - adjustmnt -= delta; - } - - if (adjustmnt > 0 && - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { - delta = MIN(adjustmnt, - zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); - total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); - } - - /* - * If after attempting to make the requested adjustment to the ARC - * the meta limit is still being exceeded then request that the - * higher layers drop some cached objects which have holds on ARC - * meta buffers. Requests to the upper layers will be made with - * increasingly large scan sizes until the ARC is below the limit. - */ - if (meta_used > arc_meta_limit) { - if (type == ARC_BUFC_DATA) { - type = ARC_BUFC_METADATA; - } else { - type = ARC_BUFC_DATA; - - if (zfs_arc_meta_prune) { - prune += zfs_arc_meta_prune; - arc_prune_async(prune); - } - } - - if (restarts > 0) { - restarts--; - goto restart; - } - } - return (total_evicted); -} - -/* - * Evict metadata buffers from the cache, such that arcstat_meta_used is - * capped by the arc_meta_limit tunable. - */ -static uint64_t -arc_evict_meta_only(uint64_t meta_used) -{ - uint64_t total_evicted = 0; - int64_t target; - - /* - * If we're over the meta limit, we want to evict enough - * metadata to get back under the meta limit. We don't want to - * evict so much that we drop the MRU below arc_p, though. If - * we're over the meta limit more than we're over arc_p, we - * evict some from the MRU here, and some from the MFU below. - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); - - total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - - /* - * Similar to the above, we want to evict enough bytes to get us - * below the meta limit, but not so much as to drop us below the - * space allotted to the MFU (which is defined as arc_c - arc_p). - */ - target = MIN((int64_t)(meta_used - arc_meta_limit), - (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - - (arc_c - arc_p))); - - total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - - return (total_evicted); -} - -static uint64_t -arc_evict_meta(uint64_t meta_used) -{ - if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) - return (arc_evict_meta_only(meta_used)); - else - return (arc_evict_meta_balanced(meta_used)); -} - -/* - * Return the type of the oldest buffer in the given arc state - * - * This function will select a random sublist of type ARC_BUFC_DATA and - * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist - * is compared, and the type which contains the "older" buffer will be - * returned. 
- */ -static arc_buf_contents_t -arc_evict_type(arc_state_t *state) -{ - multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; - multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; - int data_idx = multilist_get_random_index(data_ml); - int meta_idx = multilist_get_random_index(meta_ml); - multilist_sublist_t *data_mls; - multilist_sublist_t *meta_mls; - arc_buf_contents_t type; - arc_buf_hdr_t *data_hdr; - arc_buf_hdr_t *meta_hdr; - - /* - * We keep the sublist lock until we're finished, to prevent - * the headers from being destroyed via arc_evict_state(). - */ - data_mls = multilist_sublist_lock(data_ml, data_idx); - meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - - /* - * These two loops are to ensure we skip any markers that - * might be at the tail of the lists due to arc_evict_state(). - */ - - for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; - data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { - if (data_hdr->b_spa != 0) - break; - } - - for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; - meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { - if (meta_hdr->b_spa != 0) - break; - } - - if (data_hdr == NULL && meta_hdr == NULL) { - type = ARC_BUFC_DATA; - } else if (data_hdr == NULL) { - ASSERT3P(meta_hdr, !=, NULL); - type = ARC_BUFC_METADATA; - } else if (meta_hdr == NULL) { - ASSERT3P(data_hdr, !=, NULL); - type = ARC_BUFC_DATA; - } else { - ASSERT3P(data_hdr, !=, NULL); - ASSERT3P(meta_hdr, !=, NULL); - - /* The headers can't be on the sublist without an L1 header */ - ASSERT(HDR_HAS_L1HDR(data_hdr)); - ASSERT(HDR_HAS_L1HDR(meta_hdr)); - - if (data_hdr->b_l1hdr.b_arc_access < - meta_hdr->b_l1hdr.b_arc_access) { - type = ARC_BUFC_DATA; - } else { - type = ARC_BUFC_METADATA; - } - } - - multilist_sublist_unlock(meta_mls); - multilist_sublist_unlock(data_mls); - - return (type); + return (frac + up - down); } /* @@ -4576,150 +4241,128 @@ arc_evict_type(arc_state_t *state) static uint64_t arc_evict(void) { - uint64_t total_evicted = 0; - uint64_t bytes; - int64_t target; - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); - uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); - - /* - * If we're over arc_meta_limit, we want to correct that before - * potentially evicting data buffers below. - */ - total_evicted += arc_evict_meta(ameta); - - /* - * Adjust MRU size - * - * If we're over the target cache size, we want to evict enough - * from the list to get back to our target size. We don't want - * to evict too much from the MRU, such that it drops below - * arc_p. So, if we're over our target cache size more than - * the MRU is over arc_p, we'll evict enough to get back to - * arc_p here, and then evict more from the MFU below. - */ - target = MIN((int64_t)(asize - arc_c), - (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); - - /* - * If we're below arc_meta_min, always prefer to evict data. - * Otherwise, try to satisfy the requested number of bytes to - * evict from the type which contains older buffers; in an - * effort to keep newer buffers in the cache regardless of their - * type. If we cannot satisfy the number of bytes from this - * type, spill over into the next type. 
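Illustration (not from the patch; inputs are made up): arc_evict_adj() above keeps each balance fraction in 32-bit fixed point, and the shift-by-s arithmetic is essentially overflow avoidance; stripped of it, the update is frac += up/total - (down/total) * (100/balance). A minimal floating-point model of that step:

#include <stdio.h>

/*
 * Simplified model of arc_evict_adj(): 'frac' is one class's allowed
 * share of the cache, 'total' the ghost size it is measured against,
 * 'up'/'down' the ghost-hit bytes arguing for growing or shrinking
 * that share, and 'balance' (percent) damps the shrink side.  The
 * kernel version does the same in 32-bit fixed point, picking shifts
 * with highbit64() and clamping near the extremes to avoid overflow.
 */
static double
evict_adj_model(double frac, double total, double up, double down,
    double balance)
{
        if (total < 8 || up + down == 0)
                return (frac);          /* too little history to act on */
        return (frac + up / total - (down / total) * (100.0 / balance));
}

int
main(void)
{
        /*
         * Made-up numbers: metadata currently allowed 25% of the cache,
         * ghost lists total 8 units, 1 unit of metadata ghost hits,
         * 0.4 units of data ghost hits, a balance factor of 500.
         */
        printf("%.3f\n", evict_adj_model(0.25, 8.0, 1.0, 0.4, 500.0));
        /* Prints 0.365: the metadata share is nudged toward demand. */
        return (0);
}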
- */ - if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from metadata. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); - } + uint64_t asize, bytes, total_evicted = 0; + int64_t e, mrud, mrum, mfud, mfum, w; + static uint64_t ogrd, ogrm, ogfd, ogfm; + static uint64_t gsrd, gsrm, gsfd, gsfm; + uint64_t ngrd, ngrm, ngfd, ngfm; + + /* Get current size of ARC states we can evict from. */ + mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); + mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + uint64_t d = mrud + mfud; + uint64_t m = mrum + mfum; + uint64_t t = d + m; + + /* Get ARC ghost hits since last eviction. */ + ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t grd = ngrd - ogrd; + ogrd = ngrd; + ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t grm = ngrm - ogrm; + ogrm = ngrm; + ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + uint64_t gfd = ngfd - ogfd; + ogfd = ngfd; + ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); + uint64_t gfm = ngfm - ogfm; + ogfm = ngfm; + + /* Adjust ARC states balance based on ghost hits. */ + arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, + grm + gfm, grd + gfd, zfs_arc_meta_balance); + arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); + arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); - /* - * Re-sum ARC stats after the first round of evictions. - */ asize = aggsum_value(&arc_sums.arcstat_size); - ameta = aggsum_value(&arc_sums.arcstat_meta_used); - - - /* - * Adjust MFU size - * - * Now that we've tried to evict enough from the MRU to get its - * size back to arc_p, if we're still above the target cache - * size, we evict the rest from the MFU. - */ - target = asize - arc_c; - - if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && - ameta > arc_meta_min) { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * metadata, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - } else { - bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - /* - * If we couldn't evict our target number of bytes from - * data, we try to get the rest from data. - */ - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - } - - /* - * Adjust ghost lists - * - * In addition to the above, the ARC also defines target values - * for the ghost lists. 
The sum of the mru list and mru ghost - * list should never exceed the target size of the cache, and - * the sum of the mru list, mfu list, mru ghost list, and mfu - * ghost list should never exceed twice the target size of the - * cache. The following logic enforces these limits on the ghost - * caches, and evicts from them as needed. - */ - target = zfs_refcount_count(&arc_mru->arcs_size) + - zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - - bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + int64_t wt = t - (asize - arc_c); + + /* + * Try to reduce pinned dnodes if more than 3/4 of wanted metadata + * target is not evictable or if they go over arc_dnode_limit. + */ + int64_t prune = 0; + int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + w = wt * (int64_t)(arc_meta >> 16) >> 16; + if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) > + w * 3 / 4) { + prune = dn / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } else if (dn > arc_dnode_limit) { + prune = (dn - arc_dnode_limit) / sizeof (dnode_t) * + zfs_arc_dnode_reduce_percent / 100; + } + if (prune > 0) + arc_prune_async(prune); + + /* Evict MRU metadata. */ + w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; + mrum -= bytes; + asize -= bytes; - target -= bytes; + /* Evict MFU metadata. */ + w = wt * (int64_t)(arc_meta >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); + total_evicted += bytes; + mfum -= bytes; + asize -= bytes; + + /* Evict MRU data. */ + wt -= m - total_evicted; + w = wt * (int64_t)(arc_pd >> 16) >> 16; + e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); + bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); + total_evicted += bytes; + mrud -= bytes; + asize -= bytes; - total_evicted += - arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + /* Evict MFU data. */ + e = asize - arc_c; + bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); + mfud -= bytes; + total_evicted += bytes; /* - * We assume the sum of the mru list and mfu list is less than - * or equal to arc_c (we enforced this above), which means we - * can use the simpler of the two equations below: + * Evict ghost lists * - * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c - * mru ghost + mfu ghost <= arc_c - */ - target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + - zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; - - bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); - total_evicted += bytes; - - target -= bytes; - - total_evicted += - arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + * Size of each state's ghost list represents how much that state + * may grow by shrinking the other states. Would it need to shrink + * other states to zero (that is unlikely), its ghost size would be + * equal to sum of other three state sizes. But excessive ghost + * size may result in false ghost hits (too far back), that may + * never result in real cache hits if several states are competing. + * So choose some arbitraty point of 1/2 of other state sizes. 
+ */ + gsrd = (mrum + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - + gsrd; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); + + gsrm = (mrud + mfud + mfum) / 2; + e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsrm; + (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); + + gsfd = (mrud + mrum + mfum) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - + gsfd; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); + + gsfm = (mrud + mrum + mfud) / 2; + e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - + gsfm; + (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } @@ -4734,7 +4377,7 @@ arc_flush(spa_t *spa, boolean_t retry) * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ - ASSERT(!retry || spa == 0); + ASSERT(!retry || spa == NULL); if (spa != NULL) guid = spa_load_guid(spa); @@ -4750,12 +4393,18 @@ arc_flush(spa_t *spa, boolean_t retry) (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { - uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + uint64_t c = arc_c; + + if (c <= arc_c_min) + return; /* * All callers want the ARC to actually evict (at least) this much @@ -4765,26 +4414,16 @@ arc_reduce_target_size(int64_t to_free) * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ - uint64_t c = MIN(arc_c, asize); - - if (c > to_free && c - to_free > arc_c_min) { - arc_c = c - to_free; - atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); - if (arc_p > arc_c) - arc_p = (arc_c >> 1); - ASSERT(arc_c >= arc_c_min); - ASSERT((int64_t)arc_p >= 0); - } else { - arc_c = arc_c_min; - } + uint64_t asize = aggsum_value(&arc_sums.arcstat_size); + if (asize < c) + to_free += c - asize; + arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min); - if (asize > arc_c) { - /* See comment in arc_evict_cb_check() on why lock+flag */ - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - mutex_exit(&arc_evict_lock); - zthr_wakeup(arc_evict_zthr); - } + /* See comment in arc_evict_cb_check() on why lock+flag */ + mutex_enter(&arc_evict_lock); + arc_evict_needed = B_TRUE; + mutex_exit(&arc_evict_lock); + zthr_wakeup(arc_evict_zthr); } /* @@ -4804,18 +4443,8 @@ arc_kmem_reap_soon(void) size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; - extern kmem_cache_t *zio_buf_cache[]; - extern kmem_cache_t *zio_data_buf_cache[]; #ifdef _KERNEL - if ((aggsum_compare(&arc_sums.arcstat_meta_used, - arc_meta_limit) >= 0) && zfs_arc_meta_prune) { - /* - * We are exceeding our meta-data cache limit. - * Prune some entries to release holds on meta-data. - */ - arc_prune_async(zfs_arc_meta_prune); - } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. 
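Worked example (not from the patch; sizes are made up): the ghost-list targets computed above cap each ghost list at half the combined size of the other three resident classes, so ghost history cannot grow far beyond what could plausibly be cached:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
        /*
         * Made-up resident sizes, in GiB, for the four evictable
         * classes used by arc_evict(): MRU data/metadata and MFU
         * data/metadata.
         */
        uint64_t mrud = 6, mrum = 2, mfud = 8, mfum = 2;

        /*
         * Each ghost list is trimmed to half of what the other three
         * classes currently occupy, as in the hunk above.
         */
        uint64_t gsrd = (mrum + mfud + mfum) / 2;       /* MRU ghost, data */
        uint64_t gsrm = (mrud + mfud + mfum) / 2;       /* MRU ghost, meta */
        uint64_t gsfd = (mrud + mrum + mfum) / 2;       /* MFU ghost, data */
        uint64_t gsfm = (mrud + mrum + mfud) / 2;       /* MFU ghost, meta */

        printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
            gsrd, gsrm, gsfd, gsfm);    /* prints "6 8 5 8" */
        return (0);
}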
@@ -4846,10 +4475,11 @@ arc_kmem_reap_soon(void) abd_cache_reap_now(); } -/* ARGSUSED */ static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { + (void) arg, (void) zthr; + #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information @@ -4882,22 +4512,38 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ - return (arc_evict_needed); + if (arc_evict_needed) + return (B_TRUE); + + /* + * If we have buffers in uncached state, evict them periodically. + */ + return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) && + ddi_get_lbolt() - arc_last_uncached_flush > + MSEC_TO_TICK(arc_min_prefetch_ms / 2))); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ -/* ARGSUSED */ static void arc_evict_cb(void *arg, zthr_t *zthr) { + (void) arg; + uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); - /* Evict from cache */ - evicted = arc_evict(); + /* Always try to evict from uncached state. */ + arc_last_uncached_flush = ddi_get_lbolt(); + evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE); + evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE); + + /* Evict from other states only if told to. */ + if (arc_evict_needed) + evicted += arc_evict(); /* * If evicted is zero, we couldn't evict anything @@ -4909,9 +4555,13 @@ arc_evict_cb(void *arg, zthr_t *zthr) * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. + * + * Note we cancel using zthr instead of arc_evict_zthr + * because the latter may not yet be initializd when the + * callback is first invoked. */ mutex_enter(&arc_evict_lock); - arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* @@ -4929,10 +4579,11 @@ arc_evict_cb(void *arg, zthr_t *zthr) spl_fstrans_unmark(cookie); } -/* ARGSUSED */ static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { + (void) arg, (void) zthr; + int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; @@ -4976,10 +4627,11 @@ arc_reap_cb_check(void *arg, zthr_t *zthr) * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ -/* ARGSUSED */ static void arc_reap_cb(void *arg, zthr_t *zthr) { + (void) arg, (void) zthr; + int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); @@ -5009,10 +4661,11 @@ arc_reap_cb(void *arg, zthr_t *zthr) */ free_memory = arc_available_memory(); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { - arc_reduce_target_size(to_free); + int64_t can_free = arc_c - arc_c_min; + if (can_free > 0) { + int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; + if (to_free > 0) + arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } @@ -5072,40 +4725,8 @@ arc_reap_cb(void *arg, zthr_t *zthr) * when we are adding new content to the cache. 
*/ static void -arc_adapt(int bytes, arc_state_t *state) +arc_adapt(uint64_t bytes) { - int mult; - uint64_t arc_p_min = (arc_c >> arc_p_min_shift); - int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); - int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - - ASSERT(bytes > 0); - /* - * Adapt the target size of the MRU list: - * - if we just hit in the MRU ghost list, then increase - * the target size of the MRU list. - * - if we just hit in the MFU ghost list, then increase - * the target size of the MFU list by decreasing the - * target size of the MRU list. - */ - if (state == arc_mru_ghost) { - mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - - arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); - } else if (state == arc_mfu_ghost) { - uint64_t delta; - - mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); - if (!zfs_arc_p_dampener_disable) - mult = MIN(mult, 10); - - delta = MIN(bytes * mult, arc_p); - arc_p = MAX(arc_p_min, arc_p - delta); - } - ASSERT((int64_t)arc_p >= 0); - /* * Wake reap thread if we do not have any available memory */ @@ -5124,18 +4745,12 @@ arc_adapt(int bytes, arc_state_t *state) * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ - ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); - if (aggsum_upper_bound(&arc_sums.arcstat_size) >= - arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { - atomic_add_64(&arc_c, (int64_t)bytes); - if (arc_c > arc_c_max) + if (aggsum_upper_bound(&arc_sums.arcstat_size) + + 2 * SPA_MAXBLOCKSIZE >= arc_c) { + uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); + if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; - else if (state == arc_anon) - atomic_add_64(&arc_p, (int64_t)bytes); - if (arc_p > arc_c) - arc_p = arc_c; } - ASSERT((int64_t)arc_p >= 0); } /* @@ -5167,26 +4782,24 @@ arc_is_overflowing(boolean_t use_reserve) } static abd_t * -arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); - if (type == ARC_BUFC_METADATA) { - return (abd_alloc(size, B_TRUE)); - } else { - ASSERT(type == ARC_BUFC_DATA); - return (abd_alloc(size, B_FALSE)); - } + if (alloc_flags & ARC_HDR_ALLOC_LINEAR) + return (abd_alloc_linear(size, type == ARC_BUFC_METADATA)); + else + return (abd_alloc(size, type == ARC_BUFC_METADATA)); } static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); + arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5281,14 +4894,10 @@ arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) * limit, we'll only signal the reclaim thread and continue on. 
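Sketch (not from the patch; constants are stand-ins): the rewritten arc_adapt() above no longer tunes arc_p and only grows arc_c when the reported cache size is already within two maximum-sized blocks of the target, stepping by at least SPA_OLD_MAXBLOCKSIZE so tiny allocations do not inch the target up one buffer at a time. A rough standalone model:

#include <stdio.h>
#include <stdint.h>

/* Stand-ins for SPA_MAXBLOCKSIZE (16 MiB) and SPA_OLD_MAXBLOCKSIZE (128 KiB). */
#define MAXBLOCK        (16ULL << 20)
#define OLD_MAXBLOCK    (128ULL << 10)

static uint64_t
grow_target(uint64_t target, uint64_t target_max, uint64_t used,
    uint64_t alloc_bytes)
{
        if (used + 2 * MAXBLOCK < target)
                return (target);        /* still plenty of headroom */
        uint64_t step = alloc_bytes > OLD_MAXBLOCK ? alloc_bytes :
            OLD_MAXBLOCK;
        target += step;
        return (target > target_max ? target_max : target);
}

int
main(void)
{
        uint64_t c = 1ULL << 30, c_max = 4ULL << 30;    /* 1 GiB, 4 GiB */

        /* Far below target: a 4 KiB allocation changes nothing. */
        c = grow_target(c, c_max, 512ULL << 20, 4096);
        /* Nearly full: the same allocation bumps the target by 128 KiB. */
        c = grow_target(c, c_max, c - (1ULL << 20), 4096);
        printf("%llu\n", (unsigned long long)c);        /* 1073872896 */
        return (0);
}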
*/ static void -arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); - - if (alloc_flags & ARC_HDR_DO_ADAPT) - arc_adapt(size, state); + arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data @@ -5306,7 +4915,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); - VERIFY3U(hdr->b_type, ==, type); + arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { @@ -5317,9 +4926,11 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ + arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { - (void) zfs_refcount_add_many(&state->arcs_size, size, tag); + (void) zfs_refcount_add_many(&state->arcs_size[type], size, + tag); /* * If this is reached via arc_read, the link is @@ -5335,28 +4946,19 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } - - /* - * If we are growing the cache, and we are adding anonymous - * data, and we have outgrown arc_p, update arc_p - */ - if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && - hdr->b_l1hdr.b_state == arc_anon && - (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) - arc_p = MIN(arc_c, arc_p + size); } } static void -arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) +arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, + const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) +arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); @@ -5373,7 +4975,7 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) * Free the arc data buffer. */ static void -arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); @@ -5386,7 +4988,7 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } - (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); + (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { @@ -5399,150 +5001,155 @@ arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) /* * This routine is called whenever a buffer is accessed. - * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { - clock_t now; - - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); + /* + * Update buffer prefetch status. 
+ */ + boolean_t was_prefetch = HDR_PREFETCH(hdr); + boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; + if (was_prefetch != now_prefetch) { + if (was_prefetch) { + ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, + HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, + prefetch); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); + if (was_prefetch) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + } + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } + if (now_prefetch) { + if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + ARCSTAT_BUMP(arcstat_prescient_prefetch); + } else { + ARCSTAT_BUMP(arcstat_predictive_prefetch); + } + } + if (arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + + clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { + arc_state_t *new_state; /* - * This buffer is not in the cache, and does not - * appear in our "ghost" list. Add the new buffer - * to the MRU state. + * This buffer is not in the cache, and does not appear in + * our "ghost" lists. Add it to the MRU or uncached state. */ - ASSERT0(hdr->b_l1hdr.b_arc_access); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mru, hdr, hash_lock); - + hdr->b_l1hdr.b_arc_access = now; + if (HDR_UNCACHED(hdr)) { + new_state = arc_uncached; + DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *, + hdr); + } else { + new_state = arc_mru; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + } + arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { - now = ddi_get_lbolt(); + /* + * This buffer has been accessed once recently and either + * its read is still in progress or it is in the cache. + */ + if (HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_arc_access = now; + return; + } + hdr->b_l1hdr.b_mru_hits++; + ARCSTAT_BUMP(arcstat_mru_hits); /* - * If this buffer is here because of a prefetch, then either: - * - clear the flag if this is a "referencing" read - * (any subsequent access will bump this into the MFU state). - * or - * - move the buffer to the head of the list if this is - * another prefetch (to make it less likely to be evicted). + * If the previous access was a prefetch, then it already + * handled possible promotion, so nothing more to do for now. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { - /* link protected by hash lock */ - ASSERT(multilist_link_active( - &hdr->b_l1hdr.b_arc_node)); - } else { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } + if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* - * This buffer has been "accessed" only once so far, - * but it is still in the cache. Move it to the MFU - * state. + * If more than ARC_MINTIME have passed from the previous + * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. 
- */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + arc_change_state(arc_mfu, hdr); } - hdr->b_l1hdr.b_mru_hits++; - ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* - * This buffer has been "accessed" recently, but - * was evicted from the cache. Move it to the - * MFU state. + * This buffer has been accessed once recently, but was + * evicted from the cache. Would we have bigger MRU, it + * would be an MRU hit, so handle it the same way, except + * we don't need to check the previous access time. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { + hdr->b_l1hdr.b_mru_ghost_hits++; + ARCSTAT_BUMP(arcstat_mru_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); + if (was_prefetch) { new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREFETCH | - ARC_FLAG_PRESCIENT_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, hdr, hash_lock); - - hdr->b_l1hdr.b_mru_ghost_hits++; - ARCSTAT_BUMP(arcstat_mru_ghost_hits); + arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* - * This buffer has been accessed more than once and is - * still in the cache. Keep it in the MFU state. - * - * NOTE: an add_reference() that occurred when we did - * the arc_read() will have kicked this off the list. - * If it was a prefetch, we will explicitly move it to - * the head of the list now. + * This buffer has been accessed more than once and either + * still in the cache or being restored from one of ghosts. */ - - hdr->b_l1hdr.b_mfu_hits++; - ARCSTAT_BUMP(arcstat_mfu_hits); - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + if (!HDR_IO_IN_PROGRESS(hdr)) { + hdr->b_l1hdr.b_mfu_hits++; + ARCSTAT_BUMP(arcstat_mfu_hits); + } + hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { - arc_state_t *new_state = arc_mfu; /* - * This buffer has been accessed more than once but has - * been evicted from the cache. Move it back to the - * MFU state. + * This buffer has been accessed more than once recently, but + * has been evicted from the cache. Would we have bigger MFU + * it would stay in cache, so move it back to MFU state. */ - - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { - /* - * This is a prefetch access... - * move this block back to the MRU state. - */ - new_state = arc_mru; - } - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(new_state, hdr, hash_lock); - hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + hdr->b_l1hdr.b_arc_access = now; + wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], + arc_hdr_size(hdr)); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr); + } else if (hdr->b_l1hdr.b_state == arc_uncached) { + /* + * This buffer is uncacheable, but we got a hit. Probably + * a demand read after prefetch. Nothing more to do here. 
+ */ + if (!HDR_IO_IN_PROGRESS(hdr)) + ARCSTAT_BUMP(arcstat_uncached_hits); + hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* - * This buffer is on the 2nd Level ARC. + * This buffer is on the 2nd Level ARC and was not accessed + * for a long time, so treat it as new and put into MRU. */ - - hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); - arc_change_state(arc_mfu, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); @@ -5556,7 +5163,6 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) void arc_buf_access(arc_buf_t *buf) { - mutex_enter(&buf->b_evict_lock); arc_buf_hdr_t *hdr = buf->b_hdr; /* @@ -5564,54 +5170,51 @@ arc_buf_access(arc_buf_t *buf) * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ - if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { - mutex_exit(&buf->b_evict_lock); + if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) return; - } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); - mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } - mutex_exit(&buf->b_evict_lock); - ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); + hdr->b_l1hdr.b_state == arc_mfu || + hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); + ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, + !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ -/* ARGSUSED */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { + (void) zio, (void) zb, (void) bp; + if (buf == NULL) return; - bcopy(buf->b_data, arg, arc_buf_size(buf)); + memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ -/* ARGSUSED */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { + (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { @@ -5649,7 +5252,6 @@ arc_read_done(zio_t *zio) kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; - boolean_t freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -5662,7 +5264,7 @@ arc_read_done(zio_t *zio) if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; - ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, @@ -5682,17 +5284,20 @@ arc_read_done(zio_t *zio) zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); - if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { - void *tmpbuf; - - tmpbuf = abd_borrow_buf_copy(zio->io_abd, - sizeof (zil_chain_t)); - zio_crypt_decode_mac_zil(tmpbuf, - hdr->b_crypt_hdr.b_mac); - abd_return_buf(zio->io_abd, tmpbuf, - sizeof 
(zil_chain_t)); - } else { - zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + if (zio->io_error == 0) { + if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { + void *tmpbuf; + + tmpbuf = abd_borrow_buf_copy(zio->io_abd, + sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmpbuf, + hdr->b_crypt_hdr.b_mac); + abd_return_buf(zio->io_abd, tmpbuf, + sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, + hdr->b_crypt_hdr.b_mac); + } } } @@ -5719,17 +5324,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - arc_access(hdr, hash_lock); - } + hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we @@ -5739,6 +5334,10 @@ arc_read_done(zio_t *zio) */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + + /* We need the last one to call below in original order. */ + callback_list = acb; + if (!acb->acb_done || acb->acb_nobuf) continue; @@ -5767,7 +5366,8 @@ arc_read_done(zio_t *zio) ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(zio->io_spa, &acb->acb_zb); + spa_log_error(zio->io_spa, &acb->acb_zb, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); @@ -5802,44 +5402,21 @@ arc_read_done(zio_t *zio) */ ASSERT(callback_cnt < 2 || hash_lock != NULL); - hdr->b_l1hdr.b_acb = NULL; - arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) - ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || - callback_list != NULL); - if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } - /* - * Broadcast before we drop the hash_lock to avoid the possibility - * that the hdr (and hence the cv) might be freed before we get to - * the cv_broadcast(). - */ - cv_broadcast(&hdr->b_l1hdr.b_cv); + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + (void) remove_reference(hdr, hdr); - if (hash_lock != NULL) { + if (hash_lock != NULL) mutex_exit(hash_lock); - } else { - /* - * This block was freed while we waited for the read to - * complete. It has been removed from the hash table and - * moved to the anonymous state (so that it won't show up - * in the cache). 
- */ - ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); - freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); - } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { @@ -5863,12 +5440,18 @@ arc_read_done(zio_t *zio) zio_nowait(acb->acb_zio_dummy); } - callback_list = acb->acb_next; - kmem_free(acb, sizeof (arc_callback_t)); + callback_list = acb->acb_prev; + if (acb->acb_wait) { + mutex_enter(&acb->acb_wait_lock); + acb->acb_wait_error = zio->io_error; + acb->acb_wait = B_FALSE; + cv_signal(&acb->acb_wait_cv); + mutex_exit(&acb->acb_wait_lock); + /* acb will be freed by the waiting thread. */ + } else { + kmem_free(acb, sizeof (arc_callback_t)); + } } - - if (freeable) - arc_hdr_destroy(hdr); } /* @@ -5905,6 +5488,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; + arc_buf_t *buf = NULL; int rc = 0; ASSERT(!embedded_bp || @@ -5931,10 +5515,10 @@ top: * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ - if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); - goto out; + goto done; } if (!embedded_bp) { @@ -5954,19 +5538,17 @@ top: */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { - arc_buf_t *buf = NULL; - *arc_flags |= ARC_FLAG_CACHED; + boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); if (HDR_IO_IN_PROGRESS(hdr)) { - zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; - if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); - goto out; + goto done; } + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { @@ -5980,21 +5562,28 @@ top: arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - if (*arc_flags & ARC_FLAG_WAIT) { - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); - mutex_exit(hash_lock); - goto top; - } - ASSERT(*arc_flags & ARC_FLAG_NOWAIT); - - if (done) { - arc_callback_t *acb = NULL; + DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_FALSE); + /* + * If there are multiple threads reading the same block + * and that block is not yet in the ARC, then only one + * thread will do the physical I/O and all other + * threads will wait until that I/O completes. + * Synchronous reads use the acb_wait_cv whereas nowait + * reads register a callback. Both are signalled/called + * in arc_read_done. + * + * Errors of the physical I/O may need to be propagated. + * Synchronous read errors are returned here from + * arc_read_done via acb_wait_error. Nowait reads + * attach the acb_zio_dummy zio to pio and + * arc_read_done propagates the physical I/O's io_error + * to acb_zio_dummy, and thereby to pio. 
+ */ + arc_callback_t *acb = NULL; + if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; @@ -6003,46 +5592,52 @@ top: acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; + if (*arc_flags & ARC_FLAG_WAIT) { + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, + CV_DEFAULT, NULL); + } acb->acb_zb = *zb; - if (pio != NULL) + if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - - ASSERT3P(acb->acb_done, !=, NULL); + } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); + + ARCSTAT_BUMP(arcstat_iohits); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, iohits); + + if (*arc_flags & ARC_FLAG_WAIT) { + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + rc = acb->acb_wait_error; + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); + } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || - hdr->b_l1hdr.b_state == arc_mfu); + hdr->b_l1hdr.b_state == arc_mfu || + hdr->b_l1hdr.b_state == arc_uncached); - if (done && !no_buf) { - if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { - /* - * This is a demand read which does not have to - * wait for i/o because we did a predictive - * prefetch i/o for it, which has completed. - */ - DTRACE_PROBE1( - arc__demand__hit__predictive__prefetch, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP( - arcstat_demand_hit_predictive_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PREDICTIVE_PREFETCH); - } - - if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { - ARCSTAT_BUMP( - arcstat_demand_hit_prescient_prefetch); - arc_hdr_clear_flags(hdr, - ARC_FLAG_PRESCIENT_PREFETCH); - } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, *arc_flags, B_TRUE); + if (done && !no_buf) { ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. 
*/ @@ -6057,44 +5652,28 @@ top: */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, zb); + spa_log_error(spa, zb, hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { - (void) remove_reference(hdr, hash_lock, - private); arc_buf_destroy_impl(buf); buf = NULL; + (void) remove_reference(hdr, private); } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); - } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); } - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), - demand, prefetch, !HDR_ISTYPE_METADATA(hdr), - data, metadata, hits); - - if (done) - done(NULL, zb, bp, buf, private); + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), + demand, prefetch, is_data, data, metadata, hits); + *arc_flags |= ARC_FLAG_CACHED; + goto done; } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); @@ -6105,12 +5684,13 @@ top: uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); if (*arc_flags & ARC_FLAG_CACHED_ONLY) { - rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); - goto out; + rc = SET_ERROR(ENOENT); + goto done; } if (hdr == NULL) { @@ -6119,13 +5699,12 @@ top: * embedded data. */ arc_buf_hdr_t *exists = NULL; - arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + hdr = arc_hdr_alloc(guid, psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_birth = BP_GET_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -6135,7 +5714,6 @@ top: arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data @@ -6155,7 +5733,9 @@ top: ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); +#ifdef ZFS_DEBUG ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); +#endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress @@ -6166,25 +5746,47 @@ top: * and so the performance impact shouldn't * matter. 
*/ - cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + arc_callback_t *acb = kmem_zalloc( + sizeof (arc_callback_t), KM_SLEEP); + acb->acb_wait = B_TRUE; + mutex_init(&acb->acb_wait_lock, NULL, + MUTEX_DEFAULT, NULL); + cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, + NULL); + acb->acb_zio_head = + hdr->b_l1hdr.b_acb->acb_zio_head; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb->acb_prev = acb; + hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); + mutex_enter(&acb->acb_wait_lock); + while (acb->acb_wait) { + cv_wait(&acb->acb_wait_cv, + &acb->acb_wait_lock); + } + mutex_exit(&acb->acb_wait_lock); + mutex_destroy(&acb->acb_wait_lock); + cv_destroy(&acb->acb_wait_cv); + kmem_free(acb, sizeof (arc_callback_t)); goto top; } - - /* - * This is a delicate dance that we play here. - * This hdr might be in the ghost list so we access - * it to move it out of the ghost list before we - * initiate the read. If it's a prefetch then - * it won't have a callback so we'll remove the - * reference that arc_buf_alloc_impl() created. We - * do this after we've called arc_access() to - * avoid hitting an assert in remove_reference(). - */ - arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); - arc_access(hdr, hash_lock); + } + if (*arc_flags & ARC_FLAG_UNCACHED) { + arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); + if (!encrypted_read) + alloc_flags |= ARC_HDR_ALLOC_LINEAR; } + /* + * Take additional reference for IO_IN_PROGRESS. It stops + * arc_access() from putting this header without any buffers + * and so other references but obviously nonevictable onto + * the evictable list of MRU or MFU state. + */ + add_reference(hdr, hdr); + if (!embedded_bp) + arc_access(hdr, *arc_flags, B_FALSE); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); @@ -6211,24 +5813,10 @@ top: zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } - if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_decrement_state(hdr); - arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); - if (HDR_HAS_L2HDR(hdr)) - l2arc_hdr_arcstats_increment_state(hdr); - } - if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); - if (*arc_flags & ARC_FLAG_L2CACHE) - arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); - if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) - arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -6241,7 +5829,6 @@ top: ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { @@ -6282,7 +5869,7 @@ top: blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); @@ -6300,11 +5887,9 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch or l2arc_noprefetch is 0. 
*/ if (HDR_HAS_L2HDR(hdr) && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && - !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; @@ -6356,8 +5941,7 @@ top: asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | + zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; @@ -6436,6 +6020,16 @@ out: spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); + +done: + if (done) + done(NULL, zb, bp, buf, private); + if (pio && rc != 0) { + zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); + zio->io_error = rc; + zio_nowait(zio); + } + goto out; } arc_prune_t * @@ -6476,6 +6070,56 @@ arc_remove_prune_callback(arc_prune_t *p) } /* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + (void) zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffers they reference. This provides a mechanism to ensure the ARC can + * honor the metadata limit and reclaim otherwise pinned ARC buffers. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +static void +arc_prune_async(uint64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + (void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + +/* * Notify the arc that a block was freed, and thus will never be used again. */ void @@ -6493,10 +6137,8 @@ arc_freed(spa_t *spa, const blkptr_t *bp) /* * We might be trying to free a block that is still doing I/O - * (i.e. prefetch) or has a reference (i.e. a dedup-ed, - * dmu_sync-ed block). If this block is being prefetched, then it - * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr - * until the I/O completes. A block may also have a reference if it is + * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, + * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU @@ -6513,9 +6155,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. 
*/ - if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { - arc_change_state(arc_anon, hdr, hash_lock); + if (!HDR_HAS_L1HDR(hdr) || + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { @@ -6531,7 +6173,7 @@ arc_freed(spa_t *spa, const blkptr_t *bp) * a new hdr for the buffer. */ void -arc_release(arc_buf_t *buf, void *tag) +arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -6541,8 +6183,6 @@ arc_release(arc_buf_t *buf, void *tag) * But we don't know that information at this level. */ - mutex_enter(&buf->b_evict_lock); - ASSERT(HDR_HAS_L1HDR(hdr)); /* @@ -6551,14 +6191,14 @@ arc_release(arc_buf_t *buf, void *tag) * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { - mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); - ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; @@ -6607,7 +6247,7 @@ arc_release(arc_buf_t *buf, void *tag) /* * Do we have more than one buf? */ - if (hdr->b_l1hdr.b_bufcnt > 1) { + if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); @@ -6618,9 +6258,9 @@ arc_release(arc_buf_t *buf, void *tag) VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - (void) remove_reference(hdr, hash_lock, tag); + VERIFY3S(remove_reference(hdr, tag), >, 0); - if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } @@ -6637,9 +6277,9 @@ arc_release(arc_buf_t *buf, void *tag) * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); - VERIFY(!arc_buf_is_shared(lastbuf)); + ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between @@ -6656,7 +6296,7 @@ arc_release(arc_buf_t *buf, void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -6672,13 +6312,13 @@ arc_release(arc_buf_t *buf, void *tag) */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!ARC_BUF_SHARED(buf)); + ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many(&state->arcs_size, + (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { @@ -6688,10 +6328,6 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_size(buf), buf); } - hdr->b_l1hdr.b_bufcnt -= 1; - if (ARC_BUF_ENCRYPTED(buf)) - hdr->b_crypt_hdr.b_ebufcnt -= 1; - arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -6701,30 +6337,20 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(hash_lock); - /* - * Allocate a new hdr. 
The new hdr will contain a b_pabd - * buffer which will be freed in arc_write(). - */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); - ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; - nhdr->b_l1hdr.b_bufcnt = 1; - if (ARC_BUF_ENCRYPTED(buf)) - nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; - mutex_exit(&buf->b_evict_lock); - (void) zfs_refcount_add_many(&arc_anon->arcs_size, + (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { - mutex_exit(&buf->b_evict_lock); ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); @@ -6733,7 +6359,7 @@ arc_release(arc_buf_t *buf, void *tag) hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); @@ -6745,25 +6371,15 @@ arc_release(arc_buf_t *buf, void *tag) int arc_released(arc_buf_t *buf) { - int released; - - mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && + return (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); - mutex_exit(&buf->b_evict_lock); - return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { - int referenced; - - mutex_enter(&buf->b_evict_lock); - referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); - mutex_exit(&buf->b_evict_lock); - return (referenced); + return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); } #endif @@ -6779,7 +6395,7 @@ arc_write_ready(zio_t *zio) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); - ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then @@ -6790,9 +6406,10 @@ arc_write_ready(zio_t *zio) arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } @@ -6807,18 +6424,16 @@ arc_write_ready(zio_t *zio) callback->awcb_ready(zio, buf, callback->awcb_private); - if (HDR_IO_IN_PROGRESS(hdr)) + if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - - arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - - if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) - hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + add_reference(hdr, hdr); /* For IO_IN_PROGRESS. 
*/ + } if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); - ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { @@ -6831,11 +6446,14 @@ arc_write_ready(zio_t *zio) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } else { + arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* @@ -6886,10 +6504,11 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); - } else if (!abd_size_alloc_linear(arc_buf_size(buf)) || + } else if (!(HDR_UNCACHED(hdr) || + abd_size_alloc_linear(arc_buf_size(buf))) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the @@ -6898,26 +6517,25 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | + ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | - ARC_HDR_USE_RESERVE); + arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); + ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } @@ -6936,18 +6554,6 @@ arc_write_children_ready(zio_t *zio) callback->awcb_children_ready(zio, buf, callback->awcb_private); } -/* - * The SPA calls this callback for each physical write that happens on behalf - * of a logical write. See the comment in dbuf_write_physdone() for details. 
- */ -static void -arc_write_physdone(zio_t *zio) -{ - arc_write_callback_t *cb = zio->io_private; - if (cb->awcb_physdone != NULL) - cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -} - static void arc_write_done(zio_t *zio) { @@ -6964,7 +6570,7 @@ arc_write_done(zio_t *zio) buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_birth = BP_GET_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); @@ -6997,7 +6603,7 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); + arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); @@ -7010,22 +6616,24 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); + ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + VERIFY3S(remove_reference(hdr, hdr), >, 0); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) - arc_access(hdr, hash_lock); + arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + VERIFY3S(remove_reference(hdr, hdr), >, 0); } - ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); @@ -7034,11 +6642,11 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, + blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, - arc_write_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_phys_t *zb) + arc_write_done_func_t *children_ready, arc_write_done_func_t *done, + void *private, zio_priority_t priority, int zio_flags, + const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; @@ -7050,8 +6658,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); - if (l2arc) + ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); + if (uncached) + arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); + else if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { @@ -7062,11 +6672,11 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? 
ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; - bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, + memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); - bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, + memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); - bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, + memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; @@ -7083,7 +6693,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; - callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; @@ -7099,9 +6708,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ - if (arc_buf_is_shared(buf)) { + if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { + ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); @@ -7120,8 +6730,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, - arc_write_physdone, arc_write_done, callback, - priority, zio_flags, zb); + arc_write_done, callback, priority, zio_flags, zb); return (zio); } @@ -7162,7 +6771,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); - anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - + anon_size = MAX((int64_t) + (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* @@ -7218,9 +6829,14 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { - size->value.ui64 = zfs_refcount_count(&state->arcs_size); + data->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); + metadata->value.ui64 = + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); + size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = @@ -7237,22 +6853,32 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); + as->arcstat_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); + as->arcstat_demand_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); + as->arcstat_demand_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = 
wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); + as->arcstat_prefetch_data_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); + as->arcstat_prefetch_metadata_iohits.value.ui64 = + wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = @@ -7263,6 +6889,8 @@ arc_kstat_update(kstat_t *ksp, int rw) wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); + as->arcstat_uncached_hits.value.ui64 = + wmsum_value(&arc_sums.arcstat_uncached_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = @@ -7308,33 +6936,49 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - aggsum_value(&arc_sums.arcstat_dnode_size) + + wmsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, + &as->arcstat_anon_data, + &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, + &as->arcstat_mru_data, + &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_data, + &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, + &as->arcstat_mfu_data, + &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_data, + &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); + arc_kstat_update_state(arc_uncached, + &as->arcstat_uncached_size, + &as->arcstat_uncached_data, + &as->arcstat_uncached_metadata, + &as->arcstat_uncached_evictable_data, + &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - aggsum_value(&arc_sums.arcstat_dnode_size); + wmsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7432,13 +7076,21 @@ arc_kstat_update(kstat_t *ksp, int rw) as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = - aggsum_value(&arc_sums.arcstat_meta_used); + wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); + as->arcstat_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); + as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + 
as->arcstat_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); + as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = + wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = @@ -7510,7 +7162,6 @@ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); - unsigned long limit; /* Valid range: 32M - <arc_c_max> */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && @@ -7527,44 +7178,15 @@ arc_tuning_update(boolean_t verbose) (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); - arc_p = (arc_c >> 1); - if (arc_meta_limit > arc_c_max) - arc_meta_limit = arc_c_max; - if (arc_dnode_size_limit > arc_meta_limit) - arc_dnode_size_limit = arc_meta_limit; + if (arc_dnode_limit > arc_c_max) + arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); - /* Valid range: 16M - <arc_c_max> */ - if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && - (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && - (zfs_arc_meta_min <= arc_c_max)) { - arc_meta_min = zfs_arc_meta_min; - if (arc_meta_limit < arc_meta_min) - arc_meta_limit = arc_meta_min; - if (arc_dnode_size_limit < arc_meta_min) - arc_dnode_size_limit = arc_meta_min; - } - WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); - - /* Valid range: <arc_meta_min> - <arc_c_max> */ - limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : - MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; - if ((limit != arc_meta_limit) && - (limit >= arc_meta_min) && - (limit <= arc_c_max)) - arc_meta_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); - - /* Valid range: <arc_meta_min> - <arc_meta_limit> */ - limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : - MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; - if ((limit != arc_dnode_size_limit) && - (limit >= arc_meta_min) && - (limit <= arc_meta_limit)) - arc_dnode_size_limit = limit; - WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, - verbose); + /* Valid range: 0 - <all physical memory> */ + arc_dnode_limit = zfs_arc_dnode_limit ? 
zfs_arc_dnode_limit : + MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; + WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) @@ -7576,10 +7198,6 @@ arc_tuning_update(boolean_t verbose) arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } - /* Valid range: 1 - N */ - if (zfs_arc_p_min_shift) - arc_p_min_shift = zfs_arc_p_min_shift; - /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; @@ -7591,65 +7209,67 @@ arc_tuning_update(boolean_t verbose) } /* Valid range: 0 - 100 */ - if ((zfs_arc_lotsfree_percent >= 0) && - (zfs_arc_lotsfree_percent <= 100)) + if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - <all physical memory> */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) - arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem); + arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void +arc_state_multilist_init(multilist_t *ml, + multilist_sublist_index_func_t *index_func, int *maxcountp) +{ + multilist_create(ml, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); + *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); +} + +static void arc_state_init(void) { - multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); - multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_multilist_index_func); + int num_sublists = 0; + + arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + 
arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA], + arc_state_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA], + arc_state_multilist_index_func, &num_sublists); + /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_l2c_multilist_index_func); - multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), - arc_state_l2c_multilist_index_func); + arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + arc_state_l2c_multilist_index_func, &num_sublists); + arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + arc_state_l2c_multilist_index_func, &num_sublists); + + /* + * Keep track of the number of markers needed to reclaim buffers from + * any ARC state. The markers will be pre-allocated so as to minimize + * the number of memory allocations performed by the eviction thread. + */ + arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); @@ -7663,28 +7283,49 @@ arc_state_init(void) zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_create(&arc_anon->arcs_size); - zfs_refcount_create(&arc_mru->arcs_size); - zfs_refcount_create(&arc_mru_ghost->arcs_size); - zfs_refcount_create(&arc_mfu->arcs_size); - zfs_refcount_create(&arc_mfu_ghost->arcs_size); - zfs_refcount_create(&arc_l2c_only->arcs_size); + zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); + + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); + + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); + wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); + wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); 
wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); + wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); + wmsum_init(&arc_sums.arcstat_uncached_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); @@ -7706,7 +7347,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - aggsum_init(&arc_sums.arcstat_dnode_size, 0); + wmsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7751,10 +7392,14 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); - aggsum_init(&arc_sums.arcstat_meta_used, 0); + wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); + wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); + wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); + wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); @@ -7765,6 +7410,7 @@ arc_state_init(void) arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; + arc_uncached->arcs_state = ARC_STATE_UNCACHED; } static void @@ -7782,13 +7428,23 @@ arc_state_fini(void) zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); - - zfs_refcount_destroy(&arc_anon->arcs_size); - zfs_refcount_destroy(&arc_mru->arcs_size); - zfs_refcount_destroy(&arc_mru_ghost->arcs_size); - zfs_refcount_destroy(&arc_mfu->arcs_size); - zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); - zfs_refcount_destroy(&arc_l2c_only->arcs_size); + zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); + + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); + 
zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); @@ -7800,21 +7456,34 @@ arc_state_fini(void) multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); + + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); + wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_sums.arcstat_hits); + wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); + wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); + wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); + wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); + wmsum_fini(&arc_sums.arcstat_uncached_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); @@ -7836,7 +7505,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - aggsum_fini(&arc_sums.arcstat_dnode_size); + wmsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); @@ -7881,10 +7550,14 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); - aggsum_fini(&arc_sums.arcstat_meta_used); + wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); + wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); + wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); + wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); 
wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); @@ -7949,18 +7622,16 @@ arc_init(void) #endif arc_c = arc_c_min; - arc_p = (arc_c >> 1); - - /* Set min to 1/2 of arc_c_min */ - arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* - * Set arc_meta_limit to a percent of arc_c_max with a floor of - * arc_meta_min, and a ceiling of arc_c_max. + * 32-bit fixed point fractions of metadata from total ARC size, + * MRU data from all data and MRU metadata from all metadata. */ - percent = MIN(zfs_arc_meta_limit_percent, 100); - arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); + arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ + arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */ + arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. */ + percent = MIN(zfs_arc_dnode_limit_percent, 100); - arc_dnode_size_limit = (percent * arc_meta_limit) / 100; + arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); @@ -7981,9 +7652,8 @@ arc_init(void) offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri, - boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | - TASKQ_THREADS_CPU_PCT); + arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, + defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7994,8 +7664,10 @@ arc_init(void) kstat_install(arc_ksp); } - arc_evict_zthr = zthr_create("arc_evict", - arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri); + arc_state_evict_markers = + arc_state_alloc_markers(arc_state_evict_marker_count); + arc_evict_zthr = zthr_create_timer("arc_evict", + arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); @@ -8060,9 +7732,8 @@ arc_fini(void) taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); - zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); + while ((p = list_remove_head(&arc_prune_list)) != NULL) { + (void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } @@ -8073,6 +7744,8 @@ arc_fini(void) (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); + arc_state_free_markers(arc_state_evict_markers, + arc_state_evict_marker_count); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); @@ -8367,7 +8040,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size, tsize; + uint64_t size; /* * Make sure our globals have meaningful values in case the user @@ -8375,38 +8048,33 @@ l2arc_write_size(l2arc_dev_t *dev) */ size = l2arc_write_max; if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); + cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " + "resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* We need to add in the worst case scenario of log block overhead. 
*/ + size += l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the writesize, whichever is greater. + */ + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); + } + /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - dev_size = dev->l2ad_end - dev->l2ad_start; - tsize = size + l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) - tsize += MAX(64 * 1024 * 1024, - (tsize * l2arc_trim_ahead) / 100); + size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4); - if (tsize >= dev_size) { - cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " - "plus the overhead of log blocks (persistent L2ARC, " - "%llu bytes) exceeds the size of the cache device " - "(guid %llu), resetting them to the default (%d)", - (u_longlong_t)l2arc_log_blk_overhead(size, dev), - (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); - size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - } + size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift); return (size); @@ -8473,12 +8141,13 @@ l2arc_dev_get_next(void) else if (next == first) break; + ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all); + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || - next->l2ad_trim_all) + next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting) next = NULL; l2arc_dev_last = next; @@ -8503,20 +8172,14 @@ out: static void l2arc_do_free_on_write(void) { - list_t *buflist; - l2arc_data_free_t *df, *df_prev; + l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); + while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); - list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } - mutex_exit(&l2arc_free_on_write_mtx); } @@ -8651,7 +8314,8 @@ top: ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); - zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + (void) zfs_refcount_remove(&dev->l2ad_lb_count, + lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); @@ -8676,14 +8340,15 @@ top: * block pointer in the header. 
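The rewritten l2arc_write_size() above folds the log-block and TRIM-ahead overheads into the target up front, caps the result at a quarter of the cache device, and rounds it up to the device's allocation size. A small standalone sketch of that arithmetic (the helper and constants below are illustrative, not the kernel macros):

#include <stdint.h>
#include <stdio.h>

/* Round x up to a multiple of align (align must be a power of two). */
static uint64_t
roundup_pow2(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

static uint64_t
toy_l2arc_write_size(uint64_t write_max, uint64_t write_boost,
    uint64_t log_blk_overhead, uint64_t trim_ahead_pct,
    uint64_t dev_size, uint64_t ashift, int arc_warm)
{
	uint64_t size = write_max;

	if (!arc_warm)
		size += write_boost;		/* extra warmup writes */
	size += log_blk_overhead;		/* persistent L2ARC log blocks */
	if (trim_ahead_pct > 0) {
		uint64_t trim = (size * trim_ahead_pct) / 100;
		size += (trim > (64ULL << 20)) ? trim : (64ULL << 20);
	}
	/* Never plan to write more than 1/4 of the device per pass. */
	if (size > dev_size / 4)
		size = dev_size / 4;
	return (roundup_pow2(size, 1ULL << ashift));
}

int
main(void)
{
	/* 32 MiB max write, 8 MiB boost, 1 MiB overhead, 1 GiB device. */
	printf("%llu\n", (unsigned long long)toy_l2arc_write_size(
	    32ULL << 20, 8ULL << 20, 1ULL << 20, 0, 1ULL << 30, 12, 0));
	return (0);
}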
*/ if (i == 0) { - bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + memset(l2dhdr, 0, + dev->l2ad_dev_hdr_asize); } else { - bzero(&l2dhdr->dh_start_lbps[i], + memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } - bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); @@ -8732,7 +8397,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8769,7 +8434,7 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, - ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); + ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -8981,7 +8646,7 @@ l2arc_sublist_lock(int list_num) * sublists being selected. */ idx = multilist_get_random_index(ml); - return (multilist_sublist_lock(ml, idx)); + return (multilist_sublist_lock_idx(ml, idx)); } /* @@ -9026,22 +8691,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) buflist = &dev->l2ad_buflist; - /* - * We need to add in the worst case scenario of log block overhead. - */ - distance += l2arc_log_blk_overhead(distance, dev); - if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { - /* - * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) - * times the write size, whichever is greater. - */ - distance += MAX(64 * 1024 * 1024, - (distance * l2arc_trim_ahead) / 100); - } - top: rerun = B_FALSE; - if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { + if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands @@ -9134,7 +8786,8 @@ retry: ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); - zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + (void) zfs_refcount_remove(&dev->l2ad_lb_count, + lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); @@ -9190,7 +8843,7 @@ retry: * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ - arc_change_state(arc_anon, hdr, hash_lock); + arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); @@ -9233,9 +8886,9 @@ out: * assertions may be violated without functional consequences * as the device is about to be removed. */ - ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end); if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } @@ -9249,7 +8902,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; - void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); @@ -9270,12 +8922,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * and copy the data. 
This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ - if (HDR_HAS_RABD(hdr) && asize != psize) { - ASSERT3U(asize, >=, psize); + if (HDR_HAS_RABD(hdr)) { + ASSERT3U(asize, >, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); - if (psize != asize) - abd_zero_off(to_write, psize, asize - psize); + abd_zero_off(to_write, psize, asize - psize); goto out; } @@ -9284,36 +8935,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) + if (asize > size) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - cabd = abd_alloc_for_io(asize, ismd); - tmp = abd_borrow_buf(cabd, asize); - - psize = zio_compress_data(compress, to_write, tmp, size, - hdr->b_complevel); - - if (psize >= size) { - abd_return_buf(cabd, tmp, asize); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); - to_write = cabd; - abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) - abd_zero_off(to_write, size, asize - size); - goto encrypt; + size_t bufsize = MAX(size, asize); + void *buf = zio_buf_alloc(bufsize); + uint64_t csize = zio_compress_data(compress, to_write, &buf, + size, hdr->b_complevel); + if (csize > psize) { + /* + * We can't re-compress the block into the original + * psize. Even if it fits into asize, it does not + * matter, since checksum will never match on read. + */ + zio_buf_free(buf, bufsize); + return (SET_ERROR(EIO)); } - ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); - if (psize < asize) - bzero((char *)tmp + psize, asize - psize); - psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, asize); - to_write = cabd; + if (asize > csize) + memset((char *)buf + csize, 0, asize - csize); + to_write = cabd = abd_get_from_buf(buf, bufsize); + abd_take_ownership_of_buf(cabd, B_TRUE); } -encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); @@ -9342,7 +8988,7 @@ encrypt: abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ - ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); + ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) @@ -9394,9 +9040,9 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; + arc_buf_hdr_t *hdr, *head, *marker; + uint64_t write_asize, write_psize, headroom; + boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); @@ -9405,10 +9051,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_lsize = write_asize = write_psize = 0; + write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. 
@@ -9423,40 +9070,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; - - VERIFY3P(mls, !=, NULL); + headroom = target_sz * l2arc_headroom; + if (zfs_compressed_arc_enabled) + headroom = (headroom * l2arc_headroom_boost) / 100; /* - * L2ARC fast warmup. - * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ - if (arc_warm == B_FALSE) + multilist_sublist_t *mls = l2arc_sublist_lock(pass); + ASSERT3P(mls, !=, NULL); + if (from_head) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); - headroom = target_sz * l2arc_headroom; - if (zfs_compressed_arc_enabled) - headroom = (headroom * l2arc_headroom_boost) / 100; - - for (; hdr; hdr = hdr_prev) { + while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; - if (arc_warm == B_FALSE) - hdr_prev = multilist_sublist_next(mls, hdr); - else - hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { - /* - * Skip this buffer rather than waiting. - */ +skip: + /* Skip this buffer rather than waiting. */ + if (from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); continue; } @@ -9471,17 +9112,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); - continue; + goto skip; } - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || @@ -9490,25 +9124,31 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); - if ((write_asize + asize) > target_sz) { + /* + * If the allocated size of this buffer plus the max + * size for the pending log block exceeds the evicted + * target size, terminate writing buffers for this run. + */ + if (write_asize + asize + + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. 
*/ - arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); - ASSERT(HDR_HAS_L1HDR(hdr)); - - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT(hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); - ASSERT3U(arc_hdr_size(hdr), >, 0); + if (from_head) { + multilist_sublist_insert_after(mls, hdr, + marker); + } else { + multilist_sublist_insert_before(mls, hdr, + marker); + } + multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it @@ -9539,32 +9179,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, - ARC_FLAG_L2_WRITING); + ARC_FLAG_L2CACHE); mutex_exit(hash_lock); - continue; + goto next; } l2arc_free_abd_on_write(to_write, asize, type); } + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ - mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); - mutex_exit(&dev->l2ad_mtx); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; - /* - * Create a list to save allocated abd buffers - * for l2arc_log_blk_commit(). - */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -9572,48 +9225,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ZIO_FLAG_CANFAIL); } - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_hits = 0; - - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); - - mutex_enter(&dev->l2ad_mtx); - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - wzio = zio_write_phys(pio, dev->l2ad_vdev, - hdr->b_l2hdr.b_daddr, asize, to_write, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); - write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - mutex_exit(hash_lock); - - /* - * Append buf info to current log and commit if full. - * arcstat_l2_{size,asize} kstats are updated - * internally. - */ - if (l2arc_log_blk_insert(dev, hdr)) - l2arc_log_blk_commit(dev, pio, cb); + if (commit) { + /* l2ad_hand will be adjusted inside. 
*/ + write_asize += + l2arc_log_blk_commit(dev, pio, cb); + } - zio_nowait(wzio); +next: + multilist_sublist_lock(mls); + if (from_head) + hdr = multilist_sublist_next(mls, marker); + else + hdr = multilist_sublist_prev(mls, marker); + multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); @@ -9622,9 +9261,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + arc_state_free_marker(marker); + /* No buffers selected for writing? */ if (pio == NULL) { - ASSERT0(write_lsize); + ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); @@ -9664,7 +9305,7 @@ l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); - return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } @@ -9672,10 +9313,10 @@ l2arc_hdr_limit_reached(void) * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ -/* ARGSUSED */ -static void +static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { + (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; @@ -9863,7 +9504,7 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { - bzero(l2dhdr, l2dhdr_asize); + memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } @@ -10111,7 +9752,7 @@ l2arc_spa_rebuild_start(spa_t *spa) /* * Main entry point for L2ARC rebuilding. */ -static void +static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; @@ -10184,7 +9825,7 @@ l2arc_rebuild(l2arc_dev_t *dev) goto out; /* Prepare the rebuild process */ - bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { @@ -10230,7 +9871,7 @@ l2arc_rebuild(l2arc_dev_t *dev) lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); - bcopy(&lbps[0], lb_ptr_buf->lb_ptr, + memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); @@ -10268,7 +9909,7 @@ l2arc_rebuild(l2arc_dev_t *dev) !dev->l2ad_first) goto out; - cond_resched(); + kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { @@ -10328,7 +9969,7 @@ out: */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); - bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* @@ -10370,8 +10011,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); @@ -10554,7 +10194,7 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * since we may allocate significant amount of memory here, let ARC * grow its arc_c. 
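The reworked write loop above no longer holds the multilist sublist lock for the whole pass: it parks a marker node at the current position, drops the lock while the buffer is transformed and handed to the write zio, then relocks and resumes from wherever the marker ended up (the markers themselves are pre-allocated via arc_state_alloc_marker()). The pattern, reduced to a plain doubly linked list with made-up names rather than the kernel's list primitives:

#include <stdio.h>

struct node {
	struct node *prev, *next;
	int payload;		/* 0 for the marker, nonzero for real entries */
};

static void
insert_after(struct node *pos, struct node *n)
{
	n->prev = pos;
	n->next = pos->next;
	pos->next->prev = n;
	pos->next = n;
}

static void
remove_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int
main(void)
{
	struct node head = { &head, &head, 0 };	/* circular sentinel */
	struct node elem[5];
	struct node marker = { NULL, NULL, 0 };

	for (int i = 4; i >= 0; i--) {
		elem[i].payload = i + 1;
		insert_after(&head, &elem[i]);
	}

	/* "lock" */
	struct node *cur = head.next;
	while (cur != &head) {
		/* Park the marker so our place survives dropping the lock. */
		insert_after(cur, &marker);
		/* "unlock"; long-running work (e.g. issuing a write zio)... */
		printf("processing %d\n", cur->payload);
		/* "relock"; resume from the marker, which other walkers skip. */
		cur = marker.next;
		remove_node(&marker);
	}
	return (0);
}

In the kernel the point of the marker is that other threads may remove the node we were looking at while the lock is dropped; nothing concurrent happens in this sketch, it only shows the bookkeeping.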
*/ - arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* @@ -10691,11 +10331,10 @@ l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); @@ -10761,7 +10400,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ -static void +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; @@ -10769,12 +10408,11 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; - uint8_t *tmpbuf; + uint8_t *tmpbuf = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); - tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); @@ -10793,7 +10431,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, - abd_buf->abd, tmpbuf, sizeof (*lb), 0); + abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); @@ -10819,13 +10457,13 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ - bzero(tmpbuf + psize, asize - psize); + memset(tmpbuf + psize, 0, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ - bcopy(lb, tmpbuf, sizeof (*lb)); + memcpy(tmpbuf, lb, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); @@ -10851,7 +10489,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. 
*/ - bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); @@ -10873,6 +10511,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; + + return (asize); } /* @@ -10940,7 +10580,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; - bzero(le, sizeof (*le)); + memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; @@ -10953,7 +10593,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); - L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); @@ -11009,79 +10649,56 @@ EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, - param_get_long, ZMOD_RW, "Min arc size"); + spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, - param_get_long, ZMOD_RW, "Max arc size"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Metadata limit for arc size"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, - param_set_arc_long, param_get_long, ZMOD_RW, - "Percent of arc size for arc meta limit"); - -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, - param_get_long, ZMOD_RW, "Min arc metadata"); + spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, - "Meta objects to scan for prune"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW, - "Limit number of restarts in arc_evict_meta"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, - "Meta reclaim strategy"); +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, + "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, - param_get_int, ZMOD_RW, "Seconds before growing arc size"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, - "Disable arc_p adapt dampener"); + param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, - param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); + param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, - "Percent of pagecache to reclaim arc to"); + "Percent of pagecache to reclaim ARC to"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, - param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); - -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); 
ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, - "Disable compressed arc buffers"); + "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, - param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); + param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, - param_set_arc_int, param_get_int, ZMOD_RW, + param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, @@ -11093,41 +10710,42 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, - "If set to 1 exclude dbufs on special vdevs from being cached to " - "L2ARC."); + "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, - param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); + param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, - param_get_long, ZMOD_RW, "System free memory target size in bytes"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, 
"Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, - param_set_arc_long, param_get_long, ZMOD_RW, + param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, + "Number of arc_prune threads"); diff --git a/sys/contrib/openzfs/module/zfs/blake3_zfs.c b/sys/contrib/openzfs/module/zfs/blake3_zfs.c new file mode 100644 index 000000000000..7783282b671a --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/blake3_zfs.c @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2022 Tino Reichardt <milky-zfs@mcmilk.de> + */ + +#include <sys/zfs_context.h> +#include <sys/zio_checksum.h> +#include <sys/blake3.h> +#include <sys/abd.h> + +static int +blake3_incremental(void *buf, size_t size, void *arg) +{ + BLAKE3_CTX *ctx = arg; + + Blake3_Update(ctx, buf, size); + + return (0); +} + +/* + * Computes a native 256-bit BLAKE3 MAC checksum. Please note that this + * function requires the presence of a ctx_template that should be allocated + * using abd_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template, + zio_cksum_t *zcp) +{ + ASSERT(ctx_template != NULL); + +#if defined(_KERNEL) + kpreempt_disable(); + BLAKE3_CTX *ctx = blake3_per_cpu_ctx[CPU_SEQID]; +#else + BLAKE3_CTX *ctx = kmem_alloc(sizeof (*ctx), KM_SLEEP); +#endif + + memcpy(ctx, ctx_template, sizeof (*ctx)); + (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx); + Blake3_Final(ctx, (uint8_t *)zcp); + +#if defined(_KERNEL) + kpreempt_enable(); +#else + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +#endif +} + +/* + * Byteswapped version of abd_checksum_blake3_native. This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * BLAKE3 is internally endian-insensitive). 
+ */ +void +abd_checksum_blake3_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + ASSERT(ctx_template != NULL); + + abd_checksum_blake3_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a BLAKE3 MAC template suitable for using in BLAKE3 MAC checksum + * computations and returns a pointer to it. + */ +void * +abd_checksum_blake3_tmpl_init(const zio_cksum_salt_t *salt) +{ + BLAKE3_CTX *ctx; + + ASSERT(sizeof (salt->zcs_bytes) == 32); + + /* init reference object */ + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + Blake3_InitKeyed(ctx, salt->zcs_bytes); + + return (ctx); +} + +/* + * Frees a BLAKE3 context template previously allocated using + * zio_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_tmpl_free(void *ctx_template) +{ + BLAKE3_CTX *ctx = ctx_template; + + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/sys/contrib/openzfs/module/zfs/blkptr.c b/sys/contrib/openzfs/module/zfs/blkptr.c index aa09ded8dba3..d85f0737f6f6 100644 --- a/sys/contrib/openzfs/module/zfs/blkptr.c +++ b/sys/contrib/openzfs/module/zfs/blkptr.c @@ -58,7 +58,7 @@ encode_embedded_bp_compressed(blkptr_t *bp, void *data, ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - bzero(bp, sizeof (*bp)); + memset(bp, 0, sizeof (*bp)); BP_SET_EMBEDDED(bp, B_TRUE); BP_SET_COMPRESS(bp, comp); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c index 47ea364ef26f..da7360f8ce10 100644 --- a/sys/contrib/openzfs/module/zfs/bplist.c +++ b/sys/contrib/openzfs/module/zfs/bplist.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -65,9 +65,8 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { + while ((bpe = list_remove_head(&bpl->bpl_list))) { bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); kmem_free(bpe, sizeof (*bpe)); @@ -82,10 +81,7 @@ bplist_clear(bplist_t *bpl) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); + while ((bpe = list_remove_head(&bpl->bpl_list))) kmem_free(bpe, sizeof (*bpe)); - } mutex_exit(&bpl->bpl_lock); } diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c index e75ba5cccde6..96e1601c4e9c 100644 --- a/sys/contrib/openzfs/module/zfs/bpobj.c +++ b/sys/contrib/openzfs/module/zfs/bpobj.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -156,7 +156,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) if (err) return (err); - bzero(bpo, sizeof (*bpo)); + memset(bpo, 0, sizeof (*bpo)); mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); ASSERT(bpo->bpo_dbuf == NULL); @@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { + int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; + uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) * + sizeof (blkptr_t); + uint64_t ps = start * sizeof (blkptr_t); + uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0, + ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb, + ZIO_PRIORITY_ASYNC_READ); + } + for (; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); @@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, if (dbuf) dmu_buf_rele(dbuf, FTAG); err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, FTAG, &dbuf, 0); + offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH); if (err) break; + pe = pb; + pb = MAX((dbuf->db_offset > dmu_prefetch_max) ? + dbuf->db_offset - dmu_prefetch_max : 0, ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + pb, pe - pb, ZIO_PRIORITY_ASYNC_READ); + } } ASSERT3U(offset, >=, dbuf->db_offset); @@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int64_t i = bpi->bpi_unprocessed_subobjs - 1; uint64_t offset = i * sizeof (uint64_t); - uint64_t obj_from_sublist; + uint64_t subobj; err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - offset, sizeof (uint64_t), &obj_from_sublist, - DMU_READ_PREFETCH); + offset, sizeof (uint64_t), &subobj, + DMU_READ_NO_PREFETCH); if (err) break; - bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), - KM_SLEEP); - err = bpobj_open(sublist, bpo->bpo_os, - obj_from_sublist); - if (err) + bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t), + KM_SLEEP); + err = bpobj_open(subbpo, bpo->bpo_os, subobj); + if (err) { + kmem_free(subbpo, sizeof (bpobj_t)); break; + } - list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); - mutex_enter(&sublist->bpo_lock); + if (subbpo->bpo_havesubobj && + subbpo->bpo_phys->bpo_subobjs != 0) { + dmu_prefetch(subbpo->bpo_os, + subbpo->bpo_phys->bpo_subobjs, 0, 0, 0, + ZIO_PRIORITY_ASYNC_READ); + } + + list_insert_head(&stack, bpi_alloc(subbpo, bpi, i)); + mutex_enter(&subbpo->bpo_lock); bpi->bpi_unprocessed_subobjs--; } } @@ -663,14 +688,13 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (bpobj_is_empty(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); return; } + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); @@ -780,6 +804,68 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) } +/* + * Prefetch metadata required for bpobj_enqueue_subobj(). 
+ */ +void +bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj) +{ + dmu_object_info_t doi; + bpobj_t subbpo; + uint64_t subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) + return; + + if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0) + return; + if (bpobj_is_empty(&subbpo)) { + bpobj_close(&subbpo); + return; + } + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + bpobj_close(&subbpo); + + if (subsubobjs != 0) { + if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size) + copy_subsub = B_FALSE; + } + + if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) + copy_bps = B_FALSE; + + if (copy_subsub && subsubobjs != 0) { + if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } + dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } + + if (copy_bps) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1, + ZIO_PRIORITY_ASYNC_READ); + dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } else if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } +} + void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -805,12 +891,12 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, * set of BP's stored, and bpobj_iterate() wouldn't visit * all the space accounted for in the bpobj. 
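bpobj_iterate_blkptrs() above walks the block-pointer array backwards, so the new code keeps a bounded window of dmu_prefetch() read-ahead behind the cursor, limited by dmu_prefetch_max, and re-arms it as the scan reaches the edge of what was already prefetched. A rough, self-contained picture of that sliding window (indices stand in for byte offsets; the constants are arbitrary):

#include <stdio.h>

#define NBLOCKS		32
#define PREFETCH_MAX	8	/* max blocks of read-ahead in flight */

static void
prefetch_range(int lo, int hi)		/* [lo, hi) */
{
	for (int b = lo; b < hi; b++)
		printf("  prefetch block %d\n", b);
}

int
main(void)
{
	int prefetched_down_to = NBLOCKS;	/* nothing prefetched yet */

	/* Walk the object backwards with a bounded read-ahead window. */
	for (int i = NBLOCKS - 1; i >= 0; i--) {
		if (i < prefetched_down_to) {
			int lo = i - PREFETCH_MAX + 1;
			if (lo < 0)
				lo = 0;
			prefetch_range(lo, prefetched_down_to);
			prefetched_down_to = lo;
		}
		printf("read block %d\n", i);
	}
	return (0);
}

The real code re-arms the window when it holds a new dbuf, so the prefetch stays strictly ahead of the reads; the toy only shows why the window is bounded rather than prefetching the whole object at once.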
*/ - bzero(&stored_bp, sizeof (stored_bp)); + memset(&stored_bp, 0, sizeof (stored_bp)); stored_bp.blk_prop = bp->blk_prop; - stored_bp.blk_birth = bp->blk_birth; + BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp)); } else if (!BP_GET_DEDUP(bp)) { /* The bpobj will compress better without the checksum */ - bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum)); } stored_bp.blk_fill = 0; @@ -829,6 +915,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, bpo, &bpo->bpo_cached_dbuf, 0)); + ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL); } dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); @@ -860,13 +947,14 @@ struct space_range_arg { uint64_t uncomp; }; -/* ARGSUSED */ static int space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { + (void) bp_freed, (void) tx; struct space_range_arg *sra = arg; - if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg && + BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) { if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) sra->used += bp_get_dsize_sync(sra->spa, bp); else @@ -898,7 +986,7 @@ bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) /* * Return the amount of space in the bpobj which is: - * mintxg < blk_birth <= maxtxg + * mintxg < logical birth <= maxtxg */ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, @@ -932,11 +1020,11 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, * bpobj are designated as free or allocated that information is not preserved * in bplists. */ -/* ARGSUSED */ int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { + (void) bp_freed, (void) tx; bplist_t *bpl = arg; bplist_append(bpl, bp); return (0); diff --git a/sys/contrib/openzfs/module/zfs/bptree.c b/sys/contrib/openzfs/module/zfs/bptree.c index 1827a3c4e326..1f5d8e77bcc0 100644 --- a/sys/contrib/openzfs/module/zfs/bptree.c +++ b/sys/contrib/openzfs/module/zfs/bptree.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -147,11 +147,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, dmu_buf_rele(db, FTAG); } -/* ARGSUSED */ static int bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) zilog, (void) dnp; int err; struct bptree_args *ba = arg; diff --git a/sys/contrib/openzfs/module/zfs/bqueue.c b/sys/contrib/openzfs/module/zfs/bqueue.c index 22539efc4e23..a7fa516975de 100644 --- a/sys/contrib/openzfs/module/zfs/bqueue.c +++ b/sys/contrib/openzfs/module/zfs/bqueue.c @@ -27,34 +27,46 @@ obj2node(bqueue_t *q, void *data) /* * Initialize a blocking queue The maximum capacity of the queue is set to - * size. Types that are stored in a bqueue must contain a bqueue_node_t, - * and node_offset must be its offset from the start of the struct. - * fill_fraction is a performance tuning value; when the queue is full, any - * threads attempting to enqueue records will block. 
They will block until - * they're signaled, which will occur when the queue is at least 1/fill_fraction + * size. Types that are stored in a bqueue must contain a bqueue_node_t, and + * node_offset must be its offset from the start of the struct. fill_fraction + * is a performance tuning value; when the queue is full, any threads + * attempting to enqueue records will block. They will block until they're + * signaled, which will occur when the queue is at least 1/fill_fraction * empty. Similar behavior occurs on dequeue; if the queue is empty, threads - * block. They will be signalled when the queue has 1/fill_fraction full, or - * when bqueue_flush is called. As a result, you must call bqueue_flush when - * you enqueue your final record on a thread, in case the dequeueing threads are - * currently blocked and that enqueue does not cause them to be awoken. - * Alternatively, this behavior can be disabled (causing signaling to happen - * immediately) by setting fill_fraction to any value larger than size. - * Return 0 on success, or -1 on failure. + * block. They will be signalled when the queue has 1/fill_fraction full. + * As a result, you must call bqueue_enqueue_flush() when you enqueue your + * final record on a thread, in case the dequeuing threads are currently + * blocked and that enqueue does not cause them to be woken. Alternatively, + * this behavior can be disabled (causing signaling to happen immediately) by + * setting fill_fraction to any value larger than size. Return 0 on success, + * or -1 on failure. + * + * Note: The caller must ensure that for a given bqueue_t, there's only a + * single call to bqueue_enqueue() running at a time (e.g. by calling only + * from a single thread, or with locking around the call). Similarly, the + * caller must ensure that there's only a single call to bqueue_dequeue() + * running at a time. However, the one call to bqueue_enqueue() may be + * invoked concurrently with the one call to bqueue_dequeue(). 
*/ int -bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, - size_t node_offset) +bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset) { if (fill_fraction == 0) { return (-1); } list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), node_offset + offsetof(bqueue_node_t, bqn_node)); + list_create(&q->bq_dequeuing_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); + list_create(&q->bq_enqueuing_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); q->bq_node_offset = node_offset; q->bq_size = 0; + q->bq_dequeuing_size = 0; + q->bq_enqueuing_size = 0; q->bq_maxsize = size; q->bq_fill_fraction = fill_fraction; return (0); @@ -70,31 +82,40 @@ bqueue_destroy(bqueue_t *q) { mutex_enter(&q->bq_lock); ASSERT0(q->bq_size); + ASSERT0(q->bq_dequeuing_size); + ASSERT0(q->bq_enqueuing_size); cv_destroy(&q->bq_add_cv); cv_destroy(&q->bq_pop_cv); list_destroy(&q->bq_list); + list_destroy(&q->bq_dequeuing_list); + list_destroy(&q->bq_enqueuing_list); mutex_exit(&q->bq_lock); mutex_destroy(&q->bq_lock); } static void -bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, - boolean_t flush) +bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush) { ASSERT3U(item_size, >, 0); ASSERT3U(item_size, <=, q->bq_maxsize); - mutex_enter(&q->bq_lock); + obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { - cv_wait_sig(&q->bq_add_cv, &q->bq_lock); - } - q->bq_size += item_size; - list_insert_tail(&q->bq_list, data); - if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) - cv_signal(&q->bq_pop_cv); - if (flush) + q->bq_enqueuing_size += item_size; + list_insert_tail(&q->bq_enqueuing_list, data); + + if (flush || + q->bq_enqueuing_size >= q->bq_maxsize / q->bq_fill_fraction) { + /* Append the enquing list to the shared list. */ + mutex_enter(&q->bq_lock); + while (q->bq_size > q->bq_maxsize) { + cv_wait_sig(&q->bq_add_cv, &q->bq_lock); + } + q->bq_size += q->bq_enqueuing_size; + list_move_tail(&q->bq_list, &q->bq_enqueuing_list); + q->bq_enqueuing_size = 0; cv_broadcast(&q->bq_pop_cv); - mutex_exit(&q->bq_lock); + mutex_exit(&q->bq_lock); + } } /* @@ -103,7 +124,7 @@ bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, * > 0. */ void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_FALSE); } @@ -112,12 +133,12 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) * Enqueue an entry, and then flush the queue. This forces the popping threads * to wake up, even if we're below the fill fraction. We have this in a single * function, rather than having a separate call, because it prevents race - * conditions between the enqueuing thread and the dequeueing thread, where the - * enqueueing thread will wake up the dequeueing thread, that thread will + * conditions between the enqueuing thread and the dequeuing thread, where the + * enqueueing thread will wake up the dequeuing thread, that thread will * destroy the condvar before the enqueuing thread is done. 
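The bqueue rework visible above stops taking bq_lock on every enqueue: producers stage entries on a private bq_enqueuing_list and graft the whole batch onto the shared list (one lock acquisition, one broadcast) only when the staged size reaches 1/fill_fraction of the queue or on an explicit flush; bqueue_dequeue() mirrors this with bq_dequeuing_list. A condensed userspace sketch of the producer side, with invented toy_* names standing in for the kernel list, mutex and condvar primitives:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define QUEUE_MAX	1024	/* bytes the shared queue may hold */
#define FILL_FRACTION	4	/* publish when staged >= QUEUE_MAX/4 */

struct toy_bqueue {
	pthread_mutex_t	lock;
	size_t		shared_size;	/* bytes on the shared list */
	size_t		staged_size;	/* bytes on the producer's list */
	/* real code: list_t bq_list, bq_enqueuing_list, bq_dequeuing_list */
};

static void
toy_enqueue(struct toy_bqueue *q, size_t item_size, int flush)
{
	/* Stage locally: no lock, no wakeup, just append to a private list. */
	q->staged_size += item_size;

	if (!flush && q->staged_size < QUEUE_MAX / FILL_FRACTION)
		return;

	/* Publish the whole batch with a single lock acquisition. */
	pthread_mutex_lock(&q->lock);
	/* real code also waits here while the shared list is over maxsize */
	q->shared_size += q->staged_size;	/* list_move_tail() in the diff */
	q->staged_size = 0;
	/* real code: cv_broadcast() to wake the consumer */
	pthread_mutex_unlock(&q->lock);
}

int
main(void)
{
	struct toy_bqueue q;

	memset(&q, 0, sizeof (q));
	pthread_mutex_init(&q.lock, NULL);
	for (int i = 0; i < 10; i++)
		toy_enqueue(&q, 100, 0);	/* publishes every few items */
	toy_enqueue(&q, 100, 1);		/* final record: force a flush */
	printf("shared=%zu staged=%zu\n", q.shared_size, q.staged_size);
	pthread_mutex_destroy(&q.lock);
	return (0);
}

The consumer side is symmetric: bqueue_dequeue() moves the entire shared list onto bq_dequeuing_list under one lock hold and then pops entries from it without touching the lock again until it runs dry.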
*/ void -bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_TRUE); } @@ -129,27 +150,26 @@ bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) void * bqueue_dequeue(bqueue_t *q) { - void *ret = NULL; - uint64_t item_size; - mutex_enter(&q->bq_lock); - while (q->bq_size == 0) { - cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); + void *ret = list_remove_head(&q->bq_dequeuing_list); + if (ret == NULL) { + /* + * Dequeuing list is empty. Wait for there to be something on + * the shared list, then move the entire shared list to the + * dequeuing list. + */ + mutex_enter(&q->bq_lock); + while (q->bq_size == 0) { + cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); + } + ASSERT0(q->bq_dequeuing_size); + ASSERT(list_is_empty(&q->bq_dequeuing_list)); + list_move_tail(&q->bq_dequeuing_list, &q->bq_list); + q->bq_dequeuing_size = q->bq_size; + q->bq_size = 0; + cv_broadcast(&q->bq_add_cv); + mutex_exit(&q->bq_lock); + ret = list_remove_head(&q->bq_dequeuing_list); } - ret = list_remove_head(&q->bq_list); - ASSERT3P(ret, !=, NULL); - item_size = obj2node(q, ret)->bqn_size; - q->bq_size -= item_size; - if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction)) - cv_signal(&q->bq_add_cv); - mutex_exit(&q->bq_lock); + q->bq_dequeuing_size -= obj2node(q, ret)->bqn_size; return (ret); } - -/* - * Returns true if the space used is 0. - */ -boolean_t -bqueue_empty(bqueue_t *q) -{ - return (q->bq_size == 0); -} diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c new file mode 100644 index 000000000000..ea8c0735c4b7 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -0,0 +1,1673 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/brt.h> +#include <sys/brt_impl.h> +#include <sys/ddt.h> +#include <sys/bitmap.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_scan.h> +#include <sys/vdev_impl.h> +#include <sys/kstat.h> +#include <sys/wmsum.h> + +/* + * Block Cloning design. + * + * Block Cloning allows to manually clone a file (or a subset of its blocks) + * into another (or the same) file by just creating additional references to + * the data blocks without copying the data itself. Those references are kept + * in the Block Reference Tables (BRTs). 
+ *
+ * In many ways this is similar to the existing deduplication, but there are
+ * some important differences:
+ *
+ * - Deduplication is automatic and Block Cloning is not - one has to use
+ * dedicated system call(s) to clone the given file/blocks.
+ * - Deduplication keeps all data blocks in its table, even those referenced
+ * just once. Block Cloning creates an entry in its tables only when there
+ * are at least two references to the given data block. If the block was
+ * never explicitly cloned or the second to last reference was dropped,
+ * there will be neither space nor performance overhead.
+ * - Deduplication needs data to work - one needs to pass real data to the
+ * write(2) syscall, so a hash can be calculated. Block Cloning doesn't require
+ * data, just block pointers to the data, so it is extremely fast, as we pay
+ * neither the cost of reading the data, nor the cost of writing the data -
+ * we operate exclusively on metadata.
+ * - If the D (dedup) bit is not set in the block pointer, it means that
+ * the block is not in the dedup table (DDT) and we won't consult the DDT
+ * when we need to free the block. With Block Cloning, the BRT must be
+ * consulted on every free, because we cannot modify the source BP (e.g. by
+ * setting something similar to the D bit), thus we have no hint whether the
+ * block is in the Block Reference Table (BRT), so we need to look into the BRT.
+ * There is an optimization in place that allows us to eliminate the majority
+ * of BRT lookups, which is described below in the "Minimizing free penalty"
+ * section.
+ * - The BRT entry is much smaller than the DDT entry - for the BRT we store
+ * only a 64bit offset and a 64bit reference counter.
+ * - Dedup keys are cryptographic hashes, so two blocks that are close to each
+ * other on disk are most likely in totally different parts of the DDT.
+ * The BRT entry keys are offsets into a single top-level VDEV, so data blocks
+ * from one file should have BRT entries close to each other.
+ * - Scrub will only do a single pass over a block that is referenced multiple
+ * times in the DDT. Unfortunately it is not currently (if at all) possible
+ * with Block Cloning, and a block referenced multiple times will be scrubbed
+ * multiple times. The new, sorted scrub should be able to eliminate
+ * duplicated reads given enough memory.
+ * - Deduplication requires a cryptographically strong hash as a checksum or
+ * additional data verification. Block Cloning works with any checksum
+ * algorithm or even with checksumming disabled.
+ *
+ * As mentioned above, the BRT entries are much smaller than the DDT entries.
+ * To uniquely identify a block we just need its vdev id and offset. We also
+ * need to maintain a reference counter. The vdev id will often repeat, as there
+ * is a small number of top-level VDEVs and a large number of blocks stored in
+ * each VDEV. We take advantage of that to reduce the BRT entry size further by
+ * maintaining one BRT for each top-level VDEV, so we can then have only the
+ * offset and the counter as the BRT entry.
+ *
+ * Minimizing free penalty.
+ *
+ * Block Cloning allows creating additional references to any existing block.
+ * When we free a block there is no hint in the block pointer whether the block
+ * was cloned or not, so on each free we have to check if there is a
+ * corresponding entry in the BRT or not. If there is, we need to decrease
+ * the reference counter. Doing a BRT lookup on every free can potentially be
+ * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
+ * This is the main problem with deduplication, so we've learned our lesson and
+ * try not to repeat the same mistake here. How do we do that? We divide each
+ * top-level VDEV into 16MB regions. For each region we maintain a counter that
+ * is a sum of all the BRT entries that have offsets within the region. This
+ * creates the entries count array of 16bit numbers for each top-level VDEV.
+ * The entries count array is always kept in memory and updated on disk in the
+ * same transaction group as the BRT updates to keep everything in sync. We can
+ * keep the array in memory because it is very small. With 16MB regions and
+ * a 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
+ * the region size even further in the future). Now, when we want to free
+ * a block, we first consult the array. If the counter for the whole region is
+ * zero, there is no need to look for the BRT entry, as there isn't one for
+ * sure. If the counter for the region is greater than zero, only then will we
+ * do a BRT lookup, and if an entry is found we will decrease the reference
+ * counter in the BRT entry and in the entry counters array.
+ *
+ * The entry counters array is small, but can potentially be larger for very
+ * large VDEVs or smaller regions. In this case we don't want to rewrite the
+ * entire array on every change. We then divide the array into 32kB blocks and
+ * keep a bitmap of dirty blocks within a transaction group. When we sync the
+ * transaction group we can update only the parts of the entry counters array
+ * that were modified. Note: Keeping track of the dirty parts of the entry
+ * counters array is implemented, but updating only parts of the array on disk
+ * is not yet implemented - for now we will update the entire array if there
+ * was any change.
+ *
+ * The implementation tries to be economical: if the BRT is not used, or is no
+ * longer used, there will be no entries in the MOS and no additional memory
+ * used (e.g. the entry counters array is only allocated if needed).
+ *
+ * Interaction between Deduplication and Block Cloning.
+ *
+ * If both functionalities are in use, we could end up with a block that is
+ * referenced multiple times in both the DDT and the BRT. When we free one of
+ * the references we couldn't tell which table it belongs to, so we would have
+ * to decide which table takes precedence: do we first clear DDT references or
+ * BRT references? To avoid this dilemma the BRT cooperates with the DDT - if a
+ * given block is being cloned using the BRT and the BP has the D (dedup) bit
+ * set, the BRT will look up the DDT entry instead and increase the counter
+ * there. No BRT entry will be created for a block which has the D (dedup) bit
+ * set. The BRT may be more efficient for manual deduplication, but if the
+ * block is already in the DDT, then creating an additional BRT entry would be
+ * less efficient. This clever idea was proposed by Allan Jude.
+ *
+ * Block Cloning across datasets.
+ *
+ * Block Cloning is not limited to cloning blocks within the same dataset.
+ * It is possible (and very useful) to clone blocks between different datasets.
+ * One use case is recovering files from snapshots. By cloning the files into
+ * the dataset we need no additional storage. Without Block Cloning we would
+ * need additional space for those files.
+ * Another interesting use case is moving the files between datasets
+ * (copying the file content to the new dataset and removing the source file).
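A standalone sketch of the per-region entry-count check from the "Minimizing free penalty" section above (editorial illustration only: the ex_* names and the flat in-memory array are made up here; the real code keeps this state in brt_vdev_t and persists it in the MOS):

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

#define EX_RANGESIZE	(16ULL * 1024 * 1024)	/* 16MB regions */

typedef struct ex_brt_vdev {
	uint16_t *ev_entcount;	/* one 16-bit counter per region */
	uint64_t ev_nregions;
} ex_brt_vdev_t;

/* Round the vdev size up to whole regions and allocate the counters. */
static int
ex_brt_vdev_init(ex_brt_vdev_t *ev, uint64_t vdev_asize)
{
	ev->ev_nregions = (vdev_asize + EX_RANGESIZE - 1) / EX_RANGESIZE;
	ev->ev_entcount = calloc(ev->ev_nregions, sizeof (uint16_t));
	return (ev->ev_entcount == NULL ? -1 : 0);
}

/* Called when a BRT entry covering this offset is created. */
static void
ex_brt_vdev_addref(ex_brt_vdev_t *ev, uint64_t offset)
{
	ev->ev_entcount[offset / EX_RANGESIZE]++;
}

/* Called when a BRT entry covering this offset goes away. */
static void
ex_brt_vdev_decref(ex_brt_vdev_t *ev, uint64_t offset)
{
	ev->ev_entcount[offset / EX_RANGESIZE]--;
}

/*
 * The cheap check done on every free: a zero region counter guarantees there
 * is no BRT entry for this offset, so the (possibly I/O-bound) BRT lookup can
 * be skipped.  A non-zero counter may still be a false positive for this
 * particular offset.
 */
static bool
ex_brt_maybe_exists(const ex_brt_vdev_t *ev, uint64_t offset)
{
	return (ev->ev_entcount[offset / EX_RANGESIZE] != 0);
}

With the 16MB region size used here, a 1TB vdev needs 65536 such counters, i.e. the 128kB figure quoted in the comment above.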
+ * In that case Block Cloning will only be used briefly, because the BRT entries
+ * will be removed when the source is removed.
+ * Block Cloning across encrypted datasets is supported as long as both
+ * datasets share the same master key (e.g. snapshots and clones).
+ *
+ * Block Cloning flow through ZFS layers.
+ *
+ * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
+ * blocks. As of this writing no interface is implemented that allows for block
+ * cloning within a ZVOL.
+ * FreeBSD and Linux provide the copy_file_range(2) system call and we use it
+ * for block cloning.
+ *
+ * ssize_t
+ * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
+ * size_t len, unsigned int flags);
+ *
+ * Even though offsets and length represent bytes, they have to be
+ * block-aligned or we will return an error so the upper layer can
+ * fall back to the generic mechanism that will just copy the data.
+ * Using copy_file_range(2) will call the OS-independent zfs_clone_range()
+ * function. This function was implemented based on zfs_write(), but instead of
+ * writing the given data we first read block pointers from the source file
+ * using the new dmu_read_l0_bps() function. Once we have BPs from the source
+ * file we call the dmu_brt_clone() function on the destination file. This
+ * function allocates BPs for us. We iterate over all source BPs. If the given
+ * BP is a hole or an embedded block, we just copy the BP as-is. If it points
+ * to real data, we place this BP on a BRT pending list using the
+ * brt_pending_add() function.
+ *
+ * We use this pending list to keep track of all BPs that got new references
+ * within this transaction group.
+ *
+ * Some special cases to consider and how we address them:
+ * - The block we want to clone may have been created within the same
+ * transaction group that we are trying to clone. Such a block has no BP
+ * allocated yet, so it cannot be cloned immediately. We return EAGAIN.
+ * - The block we want to clone may have been modified within the same
+ * transaction group. We return EAGAIN.
+ * - A block may be cloned multiple times during one transaction group (that's
+ * why the pending list is actually a tree and not an append-only list - this
+ * way we can figure out faster whether this block is cloned for the first time
+ * in this txg or a subsequent time).
+ * - A block may be cloned and freed within the same transaction group
+ * (see dbuf_undirty()).
+ * - A block may be cloned and within the same transaction group the clone
+ * can be cloned again (see dmu_read_l0_bps()).
+ * - A file might have been deleted, but the caller still has a file descriptor
+ * open to this file and clones it.
+ *
+ * When we free a block we have an additional step in the ZIO pipeline where we
+ * call the zio_brt_free() function. We then call brt_entry_decref(), which
+ * loads the corresponding BRT entry (if one exists) and decreases its
+ * reference counter. If this is not the last reference we will stop the ZIO
+ * pipeline here. If this is the last reference or the block is not in the
+ * BRT, we continue the pipeline and free the block as usual.
+ *
+ * At the beginning of spa_sync(), when there can be no more block cloning
+ * but before issuing frees, we call brt_pending_apply(). This function applies
+ * all the new clones to the BRT table - we load BRT entries and update
+ * reference counters. To sync new BRT entries to disk, we use brt_sync().
This function will sync all dirty per-top-level-vdev BRTs, + * the entry counters arrays, etc. + * + * Block Cloning and ZIL. + * + * Every clone operation is divided into chunks (similar to write) and each + * chunk is cloned in a separate transaction. The chunk size is determined by + * how many BPs we can fit into a single ZIL entry. + * Replaying clone operation is different from the regular clone operation, + * as when we log clone operations we cannot use the source object - it may + * reside on a different dataset, so we log BPs we want to clone. + * The ZIL is replayed when we mount the given dataset, not when the pool is + * imported. Taking this into account it is possible that the pool is imported + * without mounting datasets and the source dataset is destroyed before the + * destination dataset is mounted and its ZIL replayed. + * To address this situation we leverage zil_claim() mechanism where ZFS will + * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE + * entries, we will bump reference counters for their BPs in the BRT. Then + * on mount and ZIL replay we bump the reference counters once more, while the + * first references are dropped during ZIL destroy by zil_free_clone_range(). + * It is possible that after zil_claim() we never mount the destination, so + * we never replay its ZIL and just destroy it. In this case the only taken + * references will be dropped by zil_free_clone_range(), since the cloning is + * not going to ever take place. + */ + +static kmem_cache_t *brt_entry_cache; +static kmem_cache_t *brt_pending_entry_cache; + +/* + * Enable/disable prefetching of BRT entries that we are going to modify. + */ +static int brt_zap_prefetch = 1; + +#ifdef ZFS_DEBUG +#define BRT_DEBUG(...) do { \ + if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ + __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ + } \ +} while (0) +#else +#define BRT_DEBUG(...) 
do { } while (0) +#endif + +static int brt_zap_default_bs = 12; +static int brt_zap_default_ibs = 12; + +static kstat_t *brt_ksp; + +typedef struct brt_stats { + kstat_named_t brt_addref_entry_in_memory; + kstat_named_t brt_addref_entry_not_on_disk; + kstat_named_t brt_addref_entry_on_disk; + kstat_named_t brt_addref_entry_read_lost_race; + kstat_named_t brt_decref_entry_in_memory; + kstat_named_t brt_decref_entry_loaded_from_disk; + kstat_named_t brt_decref_entry_not_in_memory; + kstat_named_t brt_decref_entry_not_on_disk; + kstat_named_t brt_decref_entry_read_lost_race; + kstat_named_t brt_decref_entry_still_referenced; + kstat_named_t brt_decref_free_data_later; + kstat_named_t brt_decref_free_data_now; + kstat_named_t brt_decref_no_entry; +} brt_stats_t; + +static brt_stats_t brt_stats = { + { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, + { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, + { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, + { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, + { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, + { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, + { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, + { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, + { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, + { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, + { "decref_free_data_later", KSTAT_DATA_UINT64 }, + { "decref_free_data_now", KSTAT_DATA_UINT64 }, + { "decref_no_entry", KSTAT_DATA_UINT64 } +}; + +struct { + wmsum_t brt_addref_entry_in_memory; + wmsum_t brt_addref_entry_not_on_disk; + wmsum_t brt_addref_entry_on_disk; + wmsum_t brt_addref_entry_read_lost_race; + wmsum_t brt_decref_entry_in_memory; + wmsum_t brt_decref_entry_loaded_from_disk; + wmsum_t brt_decref_entry_not_in_memory; + wmsum_t brt_decref_entry_not_on_disk; + wmsum_t brt_decref_entry_read_lost_race; + wmsum_t brt_decref_entry_still_referenced; + wmsum_t brt_decref_free_data_later; + wmsum_t brt_decref_free_data_now; + wmsum_t brt_decref_no_entry; +} brt_sums; + +#define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) + +static int brt_entry_compare(const void *x1, const void *x2); +static int brt_pending_entry_compare(const void *x1, const void *x2); + +static void +brt_rlock(brt_t *brt) +{ + rw_enter(&brt->brt_lock, RW_READER); +} + +static void +brt_wlock(brt_t *brt) +{ + rw_enter(&brt->brt_lock, RW_WRITER); +} + +static void +brt_unlock(brt_t *brt) +{ + rw_exit(&brt->brt_lock); +} + +static uint16_t +brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) +{ + + ASSERT3U(idx, <, brtvd->bv_size); + + if (unlikely(brtvd->bv_need_byteswap)) { + return (BSWAP_16(brtvd->bv_entcount[idx])); + } else { + return (brtvd->bv_entcount[idx]); + } +} + +static void +brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) +{ + + ASSERT3U(idx, <, brtvd->bv_size); + + if (unlikely(brtvd->bv_need_byteswap)) { + brtvd->bv_entcount[idx] = BSWAP_16(entcnt); + } else { + brtvd->bv_entcount[idx] = entcnt; + } +} + +static void +brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) +{ + uint16_t entcnt; + + ASSERT3U(idx, <, brtvd->bv_size); + + entcnt = brt_vdev_entcount_get(brtvd, idx); + ASSERT(entcnt < UINT16_MAX); + + brt_vdev_entcount_set(brtvd, idx, entcnt + 1); +} + +static void +brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) +{ + uint16_t entcnt; + + ASSERT3U(idx, <, brtvd->bv_size); + + entcnt = brt_vdev_entcount_get(brtvd, idx); + ASSERT(entcnt > 0); + + brt_vdev_entcount_set(brtvd, idx, entcnt - 1); +} + +#ifdef ZFS_DEBUG +static 
void +brt_vdev_dump(brt_vdev_t *brtvd) +{ + uint64_t idx; + + zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + (u_longlong_t)brtvd->bv_vdevid, + brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, + (u_longlong_t)brtvd->bv_size, + (u_longlong_t)brtvd->bv_totalcount, + (u_longlong_t)brtvd->bv_nblocks, + (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + if (brtvd->bv_totalcount > 0) { + zfs_dbgmsg(" entcounts:"); + for (idx = 0; idx < brtvd->bv_size; idx++) { + uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); + if (entcnt > 0) { + zfs_dbgmsg(" [%04llu] %hu", + (u_longlong_t)idx, entcnt); + } + } + } + if (brtvd->bv_entcount_dirty) { + char *bitmap; + + bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); + for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap[idx] = + BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; + } + bitmap[idx] = '\0'; + zfs_dbgmsg(" dirty: %s", bitmap); + kmem_free(bitmap, brtvd->bv_nblocks + 1); + } +} +#endif + +static brt_vdev_t * +brt_vdev(brt_t *brt, uint64_t vdevid) +{ + brt_vdev_t *brtvd; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + if (vdevid < brt->brt_nvdevs) { + brtvd = &brt->brt_vdevs[vdevid]; + } else { + brtvd = NULL; + } + + return (brtvd); +} + +static void +brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + char name[64]; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT0(brtvd->bv_mos_brtvdev); + ASSERT0(brtvd->bv_mos_entries); + ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_size > 0); + ASSERT(brtvd->bv_bitmap != NULL); + ASSERT(brtvd->bv_nblocks > 0); + + brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, + ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, + brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); + VERIFY(brtvd->bv_mos_entries != 0); + BRT_DEBUG("MOS entries created, object=%llu", + (u_longlong_t)brtvd->bv_mos_entries); + + /* + * We allocate DMU buffer to store the bv_entcount[] array. + * We will keep array size (bv_size) and cummulative count for all + * bv_entcount[]s (bv_totalcount) in the bonus buffer. 
+ */ + brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, + DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); + VERIFY(brtvd->bv_mos_brtvdev != 0); + BRT_DEBUG("MOS BRT VDEV created, object=%llu", + (u_longlong_t)brtvd->bv_mos_brtvdev); + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); + BRT_DEBUG("Pool directory object created, object=%s", name); + + spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); +} + +static void +brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) +{ + vdev_t *vd; + uint16_t *entcount; + ulong_t *bitmap; + uint64_t nblocks, size; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + + spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); + size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; + spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); + + entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); + nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); + bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); + + if (!brtvd->bv_initiated) { + ASSERT0(brtvd->bv_size); + ASSERT(brtvd->bv_entcount == NULL); + ASSERT(brtvd->bv_bitmap == NULL); + ASSERT0(brtvd->bv_nblocks); + + avl_create(&brtvd->bv_tree, brt_entry_compare, + sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + } else { + ASSERT(brtvd->bv_size > 0); + ASSERT(brtvd->bv_entcount != NULL); + ASSERT(brtvd->bv_bitmap != NULL); + ASSERT(brtvd->bv_nblocks > 0); + /* + * TODO: Allow vdev shrinking. We only need to implement + * shrinking the on-disk BRT VDEV object. + * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, + * size, tx); + */ + ASSERT3U(brtvd->bv_size, <=, size); + + memcpy(entcount, brtvd->bv_entcount, + sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); + memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), + BT_SIZEOFMAP(brtvd->bv_nblocks))); + vmem_free(brtvd->bv_entcount, + sizeof (entcount[0]) * brtvd->bv_size); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + } + + brtvd->bv_size = size; + brtvd->bv_entcount = entcount; + brtvd->bv_bitmap = bitmap; + brtvd->bv_nblocks = nblocks; + if (!brtvd->bv_initiated) { + brtvd->bv_need_byteswap = FALSE; + brtvd->bv_initiated = TRUE; + BRT_DEBUG("BRT VDEV %llu initiated.", + (u_longlong_t)brtvd->bv_vdevid); + } +} + +static void +brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) +{ + char name[64]; + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + int error; + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); + if (error != 0) + return; + ASSERT(brtvd->bv_mos_brtvdev != 0); + + error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); + ASSERT0(error); + if (error != 0) + return; + + bvphys = db->db_data; + if (brt->brt_rangesize == 0) { + brt->brt_rangesize = bvphys->bvp_rangesize; + } else { + ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); + } + + ASSERT(!brtvd->bv_initiated); + brt_vdev_realloc(brt, brtvd); + + /* TODO: We don't support VDEV shrinking. */ + ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); + + /* + * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
+ */ + error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), + brtvd->bv_entcount, DMU_READ_NO_PREFETCH); + ASSERT0(error); + + brtvd->bv_mos_entries = bvphys->bvp_mos_entries; + ASSERT(brtvd->bv_mos_entries != 0); + brtvd->bv_need_byteswap = + (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); + brtvd->bv_totalcount = bvphys->bvp_totalcount; + brtvd->bv_usedspace = bvphys->bvp_usedspace; + brtvd->bv_savedspace = bvphys->bvp_savedspace; + brt->brt_usedspace += brtvd->bv_usedspace; + brt->brt_savedspace += brtvd->bv_savedspace; + + dmu_buf_rele(db, FTAG); + + BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", + name, (u_longlong_t)brtvd->bv_mos_brtvdev, + (u_longlong_t)brtvd->bv_mos_entries); +} + +static void +brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) +{ + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); + + vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); + brtvd->bv_entcount = NULL; + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + brtvd->bv_bitmap = NULL; + ASSERT0(avl_numnodes(&brtvd->bv_tree)); + avl_destroy(&brtvd->bv_tree); + + brtvd->bv_size = 0; + brtvd->bv_nblocks = 0; + + brtvd->bv_initiated = FALSE; + BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); +} + +static void +brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + char name[64]; + uint64_t count; + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_mos_brtvdev != 0); + ASSERT(brtvd->bv_mos_entries != 0); + + VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); + VERIFY0(count); + VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); + BRT_DEBUG("MOS entries destroyed, object=%llu", + (u_longlong_t)brtvd->bv_mos_entries); + brtvd->bv_mos_entries = 0; + + VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + bvphys = db->db_data; + ASSERT0(bvphys->bvp_totalcount); + ASSERT0(bvphys->bvp_usedspace); + ASSERT0(bvphys->bvp_savedspace); + dmu_buf_rele(db, FTAG); + + VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); + BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", + (u_longlong_t)brtvd->bv_mos_brtvdev); + brtvd->bv_mos_brtvdev = 0; + + snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, + (u_longlong_t)brtvd->bv_vdevid); + VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + BRT_DEBUG("Pool directory object removed, object=%s", name); + + brt_vdev_dealloc(brt, brtvd); + + spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); +} + +static void +brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) +{ + brt_vdev_t *brtvd, *vdevs; + uint64_t vdevid; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT3U(nvdevs, >, brt->brt_nvdevs); + + vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); + if (brt->brt_nvdevs > 0) { + ASSERT(brt->brt_vdevs != NULL); + + memcpy(vdevs, brt->brt_vdevs, + sizeof (brt_vdev_t) * brt->brt_nvdevs); + kmem_free(brt->brt_vdevs, + sizeof (brt_vdev_t) * brt->brt_nvdevs); + } + for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { + brtvd = &vdevs[vdevid]; + + brtvd->bv_vdevid = vdevid; + brtvd->bv_initiated = FALSE; + } + + BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", + (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); + + brt->brt_vdevs = vdevs; + brt->brt_nvdevs = nvdevs; +} + +static boolean_t +brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t 
*bre) +{ + uint64_t idx; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + idx = bre->bre_offset / brt->brt_rangesize; + if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { + /* VDEV wasn't expanded. */ + return (brt_vdev_entcount_get(brtvd, idx) > 0); + } + + return (FALSE); +} + +static void +brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize) +{ + uint64_t idx; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + ASSERT(brtvd != NULL); + ASSERT(brtvd->bv_entcount != NULL); + + brt->brt_savedspace += dsize; + brtvd->bv_savedspace += dsize; + brtvd->bv_meta_dirty = TRUE; + + if (bre->bre_refcount > 1) { + return; + } + + brt->brt_usedspace += dsize; + brtvd->bv_usedspace += dsize; + + idx = bre->bre_offset / brt->brt_rangesize; + if (idx >= brtvd->bv_size) { + /* VDEV has been expanded. */ + brt_vdev_realloc(brt, brtvd); + } + + ASSERT3U(idx, <, brtvd->bv_size); + + brtvd->bv_totalcount++; + brt_vdev_entcount_inc(brtvd, idx); + brtvd->bv_entcount_dirty = TRUE; + idx = idx / BRT_BLOCKSIZE / 8; + BT_SET(brtvd->bv_bitmap, idx); + +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); +#endif +} + +static void +brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize) +{ + uint64_t idx; + + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd != NULL); + ASSERT(brtvd->bv_entcount != NULL); + + brt->brt_savedspace -= dsize; + brtvd->bv_savedspace -= dsize; + brtvd->bv_meta_dirty = TRUE; + + if (bre->bre_refcount > 0) { + return; + } + + brt->brt_usedspace -= dsize; + brtvd->bv_usedspace -= dsize; + + idx = bre->bre_offset / brt->brt_rangesize; + ASSERT3U(idx, <, brtvd->bv_size); + + ASSERT(brtvd->bv_totalcount > 0); + brtvd->bv_totalcount--; + brt_vdev_entcount_dec(brtvd, idx); + brtvd->bv_entcount_dirty = TRUE; + idx = idx / BRT_BLOCKSIZE / 8; + BT_SET(brtvd->bv_bitmap, idx); + +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); +#endif +} + +static void +brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +{ + dmu_buf_t *db; + brt_vdev_phys_t *bvphys; + + ASSERT(brtvd->bv_meta_dirty); + ASSERT(brtvd->bv_mos_brtvdev != 0); + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + + if (brtvd->bv_entcount_dirty) { + /* + * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 
+ */ + dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), + brtvd->bv_entcount, tx); + memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); + brtvd->bv_entcount_dirty = FALSE; + } + + dmu_buf_will_dirty(db, tx); + bvphys = db->db_data; + bvphys->bvp_mos_entries = brtvd->bv_mos_entries; + bvphys->bvp_size = brtvd->bv_size; + if (brtvd->bv_need_byteswap) { + bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; + } else { + bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; + } + bvphys->bvp_totalcount = brtvd->bv_totalcount; + bvphys->bvp_rangesize = brt->brt_rangesize; + bvphys->bvp_usedspace = brtvd->bv_usedspace; + bvphys->bvp_savedspace = brtvd->bv_savedspace; + dmu_buf_rele(db, FTAG); + + brtvd->bv_meta_dirty = FALSE; +} + +static void +brt_vdevs_alloc(brt_t *brt, boolean_t load) +{ + brt_vdev_t *brtvd; + uint64_t vdevid; + + brt_wlock(brt); + + brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); + + if (load) { + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + ASSERT(brtvd->bv_entcount == NULL); + + brt_vdev_load(brt, brtvd); + } + } + + if (brt->brt_rangesize == 0) { + brt->brt_rangesize = BRT_RANGESIZE; + } + + brt_unlock(brt); +} + +static void +brt_vdevs_free(brt_t *brt) +{ + brt_vdev_t *brtvd; + uint64_t vdevid; + + brt_wlock(brt); + + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd->bv_initiated) + brt_vdev_dealloc(brt, brtvd); + } + kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); + + brt_unlock(brt); +} + +static void +brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) +{ + + bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); + bre->bre_refcount = 0; + + *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); +} + +static int +brt_entry_compare(const void *x1, const void *x2) +{ + const brt_entry_t *bre1 = x1; + const brt_entry_t *bre2 = x2; + + return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); +} + +static int +brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) +{ + uint64_t mos_entries; + int error; + + ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + + if (!brt_vdev_lookup(brt, brtvd, bre)) + return (SET_ERROR(ENOENT)); + + /* + * Remember mos_entries object number. After we reacquire the BRT lock, + * the brtvd pointer may be invalid. + */ + mos_entries = brtvd->bv_mos_entries; + if (mos_entries == 0) + return (SET_ERROR(ENOENT)); + + brt_unlock(brt); + + error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); + + brt_wlock(brt); + + return (error); +} + +static void +brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) +{ + brt_vdev_t *brtvd; + uint64_t mos_entries = 0; + + brt_rlock(brt); + brtvd = brt_vdev(brt, vdevid); + if (brtvd != NULL) + mos_entries = brtvd->bv_mos_entries; + brt_unlock(brt); + + if (mos_entries == 0) + return; + + (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, + (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); +} + +/* + * Return TRUE if we _can_ have BRT entry for this bp. It might be false + * positive, but gives us quick answer if we should look into BRT, which + * may require reads and thus will be more expensive. 
+ */ +boolean_t +brt_maybe_exists(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search; + boolean_t mayexists = FALSE; + uint64_t vdevid; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + if (brtvd != NULL && brtvd->bv_initiated) { + if (!avl_is_empty(&brtvd->bv_tree) || + brt_vdev_lookup(brt, brtvd, &bre_search)) { + mayexists = TRUE; + } + } + + brt_unlock(brt); + + return (mayexists); +} + +uint64_t +brt_get_dspace(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_savedspace); +} + +uint64_t +brt_get_used(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_usedspace); +} + +uint64_t +brt_get_saved(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return (0); + + return (brt->brt_savedspace); +} + +uint64_t +brt_get_ratio(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt->brt_usedspace == 0) + return (100); + + return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / + brt->brt_usedspace); +} + +static int +brt_kstats_update(kstat_t *ksp, int rw) +{ + brt_stats_t *bs = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + bs->brt_addref_entry_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_in_memory); + bs->brt_addref_entry_not_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); + bs->brt_addref_entry_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_on_disk); + bs->brt_addref_entry_read_lost_race.value.ui64 = + wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); + bs->brt_decref_entry_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_in_memory); + bs->brt_decref_entry_loaded_from_disk.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); + bs->brt_decref_entry_not_in_memory.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); + bs->brt_decref_entry_not_on_disk.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); + bs->brt_decref_entry_read_lost_race.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); + bs->brt_decref_entry_still_referenced.value.ui64 = + wmsum_value(&brt_sums.brt_decref_entry_still_referenced); + bs->brt_decref_free_data_later.value.ui64 = + wmsum_value(&brt_sums.brt_decref_free_data_later); + bs->brt_decref_free_data_now.value.ui64 = + wmsum_value(&brt_sums.brt_decref_free_data_now); + bs->brt_decref_no_entry.value.ui64 = + wmsum_value(&brt_sums.brt_decref_no_entry); + + return (0); +} + +static void +brt_stat_init(void) +{ + + wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); + wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); + wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); + wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); + wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); + wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); + wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); + wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); + wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); + wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); + wmsum_init(&brt_sums.brt_decref_free_data_later, 0); + wmsum_init(&brt_sums.brt_decref_free_data_now, 0); + wmsum_init(&brt_sums.brt_decref_no_entry, 0); + + brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, + sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if 
(brt_ksp != NULL) { + brt_ksp->ks_data = &brt_stats; + brt_ksp->ks_update = brt_kstats_update; + kstat_install(brt_ksp); + } +} + +static void +brt_stat_fini(void) +{ + if (brt_ksp != NULL) { + kstat_delete(brt_ksp); + brt_ksp = NULL; + } + + wmsum_fini(&brt_sums.brt_addref_entry_in_memory); + wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); + wmsum_fini(&brt_sums.brt_addref_entry_on_disk); + wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); + wmsum_fini(&brt_sums.brt_decref_entry_in_memory); + wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); + wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); + wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); + wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); + wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); + wmsum_fini(&brt_sums.brt_decref_free_data_later); + wmsum_fini(&brt_sums.brt_decref_free_data_now); + wmsum_fini(&brt_sums.brt_decref_no_entry); +} + +void +brt_init(void) +{ + brt_entry_cache = kmem_cache_create("brt_entry_cache", + sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", + sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + brt_stat_init(); +} + +void +brt_fini(void) +{ + brt_stat_fini(); + + kmem_cache_destroy(brt_entry_cache); + kmem_cache_destroy(brt_pending_entry_cache); +} + +static brt_entry_t * +brt_entry_alloc(const brt_entry_t *bre_init) +{ + brt_entry_t *bre; + + bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); + bre->bre_offset = bre_init->bre_offset; + bre->bre_refcount = bre_init->bre_refcount; + + return (bre); +} + +static void +brt_entry_free(brt_entry_t *bre) +{ + + kmem_cache_free(brt_entry_cache, bre); +} + +static void +brt_entry_addref(brt_t *brt, const blkptr_t *bp) +{ + brt_vdev_t *brtvd; + brt_entry_t *bre, *racebre; + brt_entry_t bre_search; + avl_index_t where; + uint64_t vdevid; + int error; + + ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_wlock(brt); + + brtvd = brt_vdev(brt, vdevid); + if (brtvd == NULL) { + ASSERT3U(vdevid, >=, brt->brt_nvdevs); + + /* New VDEV was added. */ + brt_vdevs_expand(brt, vdevid + 1); + brtvd = brt_vdev(brt, vdevid); + } + ASSERT(brtvd != NULL); + if (!brtvd->bv_initiated) + brt_vdev_realloc(brt, brtvd); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre != NULL) { + BRTSTAT_BUMP(brt_addref_entry_in_memory); + } else { + /* + * brt_entry_lookup() may drop the BRT (read) lock and + * reacquire it (write). + */ + error = brt_entry_lookup(brt, brtvd, &bre_search); + /* bre_search now contains correct bre_refcount */ + ASSERT(error == 0 || error == ENOENT); + if (error == 0) + BRTSTAT_BUMP(brt_addref_entry_on_disk); + else + BRTSTAT_BUMP(brt_addref_entry_not_on_disk); + /* + * When the BRT lock was dropped, brt_vdevs[] may have been + * expanded and reallocated, we need to update brtvd's pointer. + */ + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); + if (racebre == NULL) { + bre = brt_entry_alloc(&bre_search); + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + avl_insert(&brtvd->bv_tree, bre, where); + brt->brt_nentries++; + } else { + /* + * The entry was added when the BRT lock was dropped in + * brt_entry_lookup(). 
+ */ + BRTSTAT_BUMP(brt_addref_entry_read_lost_race); + bre = racebre; + } + } + bre->bre_refcount++; + brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + + brt_unlock(brt); +} + +/* Return TRUE if block should be freed immediately. */ +boolean_t +brt_entry_decref(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t *bre, *racebre; + brt_entry_t bre_search; + avl_index_t where; + uint64_t vdevid; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_wlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre != NULL) { + BRTSTAT_BUMP(brt_decref_entry_in_memory); + goto out; + } else { + BRTSTAT_BUMP(brt_decref_entry_not_in_memory); + } + + /* + * brt_entry_lookup() may drop the BRT lock and reacquire it. + */ + error = brt_entry_lookup(brt, brtvd, &bre_search); + /* bre_search now contains correct bre_refcount */ + ASSERT(error == 0 || error == ENOENT); + /* + * When the BRT lock was dropped, brt_vdevs[] may have been expanded + * and reallocated, we need to update brtvd's pointer. + */ + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + if (error == ENOENT) { + BRTSTAT_BUMP(brt_decref_entry_not_on_disk); + bre = NULL; + goto out; + } + + racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); + if (racebre != NULL) { + /* + * The entry was added when the BRT lock was dropped in + * brt_entry_lookup(). + */ + BRTSTAT_BUMP(brt_decref_entry_read_lost_race); + bre = racebre; + goto out; + } + + BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); + bre = brt_entry_alloc(&bre_search); + ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + avl_insert(&brtvd->bv_tree, bre, where); + brt->brt_nentries++; + +out: + if (bre == NULL) { + /* + * This is a free of a regular (not cloned) block. 
+ */ + brt_unlock(brt); + BRTSTAT_BUMP(brt_decref_no_entry); + return (B_TRUE); + } + if (bre->bre_refcount == 0) { + brt_unlock(brt); + BRTSTAT_BUMP(brt_decref_free_data_now); + return (B_TRUE); + } + + ASSERT(bre->bre_refcount > 0); + bre->bre_refcount--; + if (bre->bre_refcount == 0) + BRTSTAT_BUMP(brt_decref_free_data_later); + else + BRTSTAT_BUMP(brt_decref_entry_still_referenced); + brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + + brt_unlock(brt); + + return (B_FALSE); +} + +uint64_t +brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search, *bre; + uint64_t vdevid, refcnt; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre == NULL) { + error = brt_entry_lookup(brt, brtvd, &bre_search); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) + refcnt = 0; + else + refcnt = bre_search.bre_refcount; + } else + refcnt = bre->bre_refcount; + + brt_unlock(brt); + return (refcnt); +} + +static void +brt_prefetch(brt_t *brt, const blkptr_t *bp) +{ + brt_entry_t bre; + uint64_t vdevid; + + ASSERT(bp != NULL); + + if (!brt_zap_prefetch) + return; + + brt_entry_fill(bp, &bre, &vdevid); + + brt_entry_prefetch(brt, vdevid, &bre); +} + +static int +brt_pending_entry_compare(const void *x1, const void *x2) +{ + const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; + const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; + int cmp; + + cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), + DVA_GET_VDEV(&bp2->blk_dva[0])); + if (cmp == 0) { + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if (unlikely(cmp == 0)) { + cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); + } + } + + return (cmp); +} + +void +brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) +{ + brt_t *brt; + avl_tree_t *pending_tree; + kmutex_t *pending_lock; + brt_pending_entry_t *bpe, *newbpe; + avl_index_t where; + uint64_t txg; + + brt = spa->spa_brt; + txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, !=, 0); + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; + + newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); + newbpe->bpe_bp = *bp; + newbpe->bpe_count = 1; + + mutex_enter(pending_lock); + + bpe = avl_find(pending_tree, newbpe, &where); + if (bpe == NULL) { + avl_insert(pending_tree, newbpe, where); + newbpe = NULL; + } else { + bpe->bpe_count++; + } + + mutex_exit(pending_lock); + + if (newbpe != NULL) { + ASSERT(bpe != NULL); + ASSERT(bpe != newbpe); + kmem_cache_free(brt_pending_entry_cache, newbpe); + } else { + ASSERT(bpe == NULL); + + /* Prefetch BRT entry for the syncing context. */ + brt_prefetch(brt, bp); + } +} + +void +brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) +{ + brt_t *brt; + avl_tree_t *pending_tree; + kmutex_t *pending_lock; + brt_pending_entry_t *bpe, bpe_search; + uint64_t txg; + + brt = spa->spa_brt; + txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, !=, 0); + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; + + bpe_search.bpe_bp = *bp; + + mutex_enter(pending_lock); + + bpe = avl_find(pending_tree, &bpe_search, NULL); + /* I believe we should always find bpe when this function is called. 
*/ + if (bpe != NULL) { + ASSERT(bpe->bpe_count > 0); + + bpe->bpe_count--; + if (bpe->bpe_count == 0) { + avl_remove(pending_tree, bpe); + kmem_cache_free(brt_pending_entry_cache, bpe); + } + } + + mutex_exit(pending_lock); +} + +void +brt_pending_apply(spa_t *spa, uint64_t txg) +{ + brt_t *brt = spa->spa_brt; + brt_pending_entry_t *bpe; + avl_tree_t *pending_tree; + void *c; + + ASSERT3U(txg, !=, 0); + + /* + * We are in syncing context, so no other brt_pending_tree accesses + * are possible for the TXG. Don't need to acquire brt_pending_lock. + */ + pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + + c = NULL; + while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { + boolean_t added_to_ddt; + + for (int i = 0; i < bpe->bpe_count; i++) { + /* + * If the block has DEDUP bit set, it means that it + * already exists in the DEDUP table, so we can just + * use that instead of creating new entry in + * the BRT table. + */ + if (BP_GET_DEDUP(&bpe->bpe_bp)) { + added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); + } else { + added_to_ddt = B_FALSE; + } + if (!added_to_ddt) + brt_entry_addref(brt, &bpe->bpe_bp); + } + + kmem_cache_free(brt_pending_entry_cache, bpe); + } +} + +static void +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) +{ + if (bre->bre_refcount == 0) { + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); + } else { + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); + } +} + +static void +brt_sync_table(brt_t *brt, dmu_tx_t *tx) +{ + brt_vdev_t *brtvd; + brt_entry_t *bre; + dnode_t *dn; + uint64_t vdevid; + void *c; + + brt_wlock(brt); + + for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brtvd = &brt->brt_vdevs[vdevid]; + + if (!brtvd->bv_initiated) + continue; + + if (!brtvd->bv_meta_dirty) { + ASSERT(!brtvd->bv_entcount_dirty); + ASSERT0(avl_numnodes(&brtvd->bv_tree)); + continue; + } + + ASSERT(!brtvd->bv_entcount_dirty || + avl_numnodes(&brtvd->bv_tree) != 0); + + if (brtvd->bv_mos_brtvdev == 0) + brt_vdev_create(brt, brtvd, tx); + + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + + c = NULL; + while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { + brt_sync_entry(dn, bre, tx); + brt_entry_free(bre); + ASSERT(brt->brt_nentries > 0); + brt->brt_nentries--; + } + + dnode_rele(dn, FTAG); + + brt_vdev_sync(brt, brtvd, tx); + + if (brtvd->bv_totalcount == 0) + brt_vdev_destroy(brt, brtvd, tx); + } + + ASSERT0(brt->brt_nentries); + + brt_unlock(brt); +} + +void +brt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + brt_t *brt; + + ASSERT(spa_syncing_txg(spa) == txg); + + brt = spa->spa_brt; + brt_rlock(brt); + if (brt->brt_nentries == 0) { + /* No changes. 
*/ + brt_unlock(brt); + return; + } + brt_unlock(brt); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + brt_sync_table(brt, tx); + + dmu_tx_commit(tx); +} + +static void +brt_table_alloc(brt_t *brt) +{ + + for (int i = 0; i < TXG_SIZE; i++) { + avl_create(&brt->brt_pending_tree[i], + brt_pending_entry_compare, + sizeof (brt_pending_entry_t), + offsetof(brt_pending_entry_t, bpe_node)); + mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, + NULL); + } +} + +static void +brt_table_free(brt_t *brt) +{ + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); + + avl_destroy(&brt->brt_pending_tree[i]); + mutex_destroy(&brt->brt_pending_lock[i]); + } +} + +static void +brt_alloc(spa_t *spa) +{ + brt_t *brt; + + ASSERT(spa->spa_brt == NULL); + + brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); + rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); + brt->brt_spa = spa; + brt->brt_rangesize = 0; + brt->brt_nentries = 0; + brt->brt_vdevs = NULL; + brt->brt_nvdevs = 0; + brt_table_alloc(brt); + + spa->spa_brt = brt; +} + +void +brt_create(spa_t *spa) +{ + + brt_alloc(spa); + brt_vdevs_alloc(spa->spa_brt, B_FALSE); +} + +int +brt_load(spa_t *spa) +{ + + brt_alloc(spa); + brt_vdevs_alloc(spa->spa_brt, B_TRUE); + + return (0); +} + +void +brt_unload(spa_t *spa) +{ + brt_t *brt = spa->spa_brt; + + if (brt == NULL) + return; + + brt_vdevs_free(brt); + brt_table_free(brt); + rw_destroy(&brt->brt_lock); + kmem_free(brt, sizeof (*brt)); + spa->spa_brt = NULL; +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT ZAP entries"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, + "BRT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, + "BRT ZAP indirect blockshift"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c index 57b9dbbb2b50..9c52083603f1 100644 --- a/sys/contrib/openzfs/module/zfs/btree.c +++ b/sys/contrib/openzfs/module/zfs/btree.c @@ -53,18 +53,30 @@ kmem_cache_t *zfs_btree_leaf_cache; * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ -int zfs_btree_verify_intensity = 0; +uint_t zfs_btree_verify_intensity = 0; /* - * A convenience function to silence warnings from memmove's return value and - * change argument order to src, dest. + * Convenience functions to silence warnings from memcpy/memmove's + * return values and change argument order to src, dest. 
*/ static void +bcpy(const void *src, void *dest, size_t size) +{ + (void) memcpy(dest, src, size); +} + +static void bmov(const void *src, void *dest, size_t size) { (void) memmove(dest, src, size); } +static boolean_t +zfs_btree_is_core(struct zfs_btree_hdr *hdr) +{ + return (hdr->bth_first == -1); +} + #ifdef _ILP32 #define BTREE_POISON 0xabadb10c #else @@ -76,59 +88,74 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; - if (!hdr->bth_core) { - zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, - BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - - hdr->bth_count * size); - } else { + if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; + i++) { node->btc_children[i] = (zfs_btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, (BTREE_CORE_ELEMS - hdr->bth_count) * size); + } else { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); + (void) memset(leaf->btl_elems + + (hdr->bth_first + hdr->bth_count) * size, 0x0f, + tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) - + (hdr->bth_first + hdr->bth_count) * size); } #endif } static inline void zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - uint64_t offset) + uint32_t idx, uint32_t count) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; - ASSERT3U(offset, >=, hdr->bth_count); - if (!hdr->bth_core) { - zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - (void) memset(leaf->btl_elems + offset * size, 0x0f, size); - } else { + if (zfs_btree_is_core(hdr)) { + ASSERT3U(idx, >=, hdr->bth_count); + ASSERT3U(idx, <=, BTREE_CORE_ELEMS); + ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - node->btc_children[offset + 1] = - (zfs_btree_hdr_t *)BTREE_POISON; - (void) memset(node->btc_elems + offset * size, 0x0f, size); + for (uint32_t i = 1; i <= count; i++) { + node->btc_children[idx + i] = + (zfs_btree_hdr_t *)BTREE_POISON; + } + (void) memset(node->btc_elems + idx * size, 0x0f, count * size); + } else { + ASSERT3U(idx, <=, tree->bt_leaf_cap); + ASSERT3U(idx + count, <=, tree->bt_leaf_cap); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + + (hdr->bth_first + idx) * size, 0x0f, count * size); } #endif } static inline void zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - uint64_t offset) + uint32_t idx) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; - uint8_t eval = 0x0f; - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { + ASSERT3U(idx, <, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; - VERIFY3P(node->btc_children[offset + 1], ==, cval); - for (int i = 0; i < size; i++) - VERIFY3U(node->btc_elems[offset * size + i], ==, eval); + VERIFY3P(node->btc_children[idx + 1], ==, cval); + for (size_t i = 0; i < size; i++) + VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f); } else { + ASSERT3U(idx, <, tree->bt_leaf_cap); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - for (int i = 0; i < size; i++) - VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); + if (idx >= tree->bt_leaf_cap - hdr->bth_first) + return; + for (size_t i = 0; i < size; i++) { + 
VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx) + * size + i], ==, 0x0f); + } } #endif } @@ -137,8 +164,7 @@ void zfs_btree_init(void) { zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", - BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, - NULL, 0); + BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void @@ -147,21 +173,52 @@ zfs_btree_fini(void) kmem_cache_destroy(zfs_btree_leaf_cache); } +static void * +zfs_btree_leaf_alloc(zfs_btree_t *tree) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP)); + else + return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP)); +} + +static void +zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_free(zfs_btree_leaf_cache, ptr)); + else + return (kmem_free(ptr, tree->bt_leaf_size)); +} + void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), - size_t size) + bt_find_in_buf_f bt_find_in_buf, size_t size) { - /* - * We need a minimmum of 4 elements so that when we split a node we - * always have at least two elements in each node. This simplifies the - * logic in zfs_btree_bulk_finish, since it means the last leaf will - * always have a left sibling to share with (unless it's the root). - */ - ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); + zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, + BTREE_LEAF_SIZE); +} + +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, + const void *value, zfs_btree_index_t *where); + +void +zfs_btree_create_custom(zfs_btree_t *tree, + int (*compar) (const void *, const void *), + bt_find_in_buf_f bt_find_in_buf, + size_t size, size_t lsize) +{ + size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); - bzero(tree, sizeof (*tree)); + ASSERT3U(size, <=, esize / 2); + memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; + tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? + zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; + tree->bt_leaf_size = lsize; + tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t); tree->bt_height = -1; tree->bt_bulk = NULL; } @@ -170,21 +227,20 @@ zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), * Find value in the array of elements provided. Uses a simple binary search. */ static void * -zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, const void *value, zfs_btree_index_t *where) { - uint64_t max = nelems; - uint64_t min = 0; + uint32_t max = nelems; + uint32_t min = 0; while (max > min) { - uint64_t idx = (min + max) / 2; + uint32_t idx = (min + max) / 2; uint8_t *cur = buf + idx * tree->bt_elem_size; int comp = tree->bt_compar(cur, value); - if (comp == -1) { + if (comp < 0) { min = idx + 1; - } else if (comp == 1) { + } else if (comp > 0) { max = idx; } else { - ASSERT0(comp); where->bti_offset = idx; where->bti_before = B_FALSE; return (cur); @@ -219,12 +275,13 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) * bulk-insert mode are to insert new elements. 
*/ zfs_btree_index_t idx; + size_t size = tree->bt_elem_size; if (tree->bt_bulk != NULL) { zfs_btree_leaf_t *last_leaf = tree->bt_bulk; - int compar = tree->bt_compar(last_leaf->btl_elems + - ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), - value); - if (compar < 0) { + int comp = tree->bt_compar(last_leaf->btl_elems + + (last_leaf->btl_hdr.bth_first + + last_leaf->btl_hdr.bth_count - 1) * size, value); + if (comp < 0) { /* * If what they're looking for is after the last * element, it's not in the tree. @@ -236,7 +293,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) where->bti_before = B_TRUE; } return (NULL); - } else if (compar == 0) { + } else if (comp == 0) { if (where != NULL) { where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = @@ -244,18 +301,20 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) where->bti_before = B_FALSE; } return (last_leaf->btl_elems + - ((last_leaf->btl_hdr.bth_count - 1) * - tree->bt_elem_size)); + (last_leaf->btl_hdr.bth_first + + last_leaf->btl_hdr.bth_count - 1) * size); } - if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { + if (tree->bt_compar(last_leaf->btl_elems + + last_leaf->btl_hdr.bth_first * size, value) <= 0) { /* * If what they're looking for is after the first * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = zfs_btree_find_in_buf(tree, - last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, - value, &idx); + void *d = tree->bt_find_in_buf(tree, + last_leaf->btl_elems + + last_leaf->btl_hdr.bth_first * size, + last_leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)last_leaf; @@ -266,8 +325,8 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) } zfs_btree_core_t *node = NULL; - uint64_t child = 0; - uint64_t depth = 0; + uint32_t child = 0; + uint32_t depth = 0; /* * Iterate down the tree, finding which child the value should be in @@ -276,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); - void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + void *d = tree->bt_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { @@ -296,7 +355,8 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) */ zfs_btree_leaf_t *leaf = (depth == 0 ? (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); - void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, + void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { @@ -366,24 +426,23 @@ enum bt_shift_direction { * shift is determined by shape. The direction is determined by dir. */ static inline void -bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, - uint64_t count, uint64_t off, enum bt_shift_shape shape, +bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, + uint32_t count, uint32_t off, enum bt_shift_shape shape, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; - ASSERT(node->btc_hdr.bth_core); + ASSERT(zfs_btree_is_core(&node->btc_hdr)); uint8_t *e_start = node->btc_elems + idx * size; - int sign = (dir == BSD_LEFT ? 
-1 : +1); - uint8_t *e_out = e_start + sign * off * size; - uint64_t e_count = count; - bmov(e_start, e_out, e_count * size); + uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size : + e_start + off * size); + bmov(e_start, e_out, count * size); zfs_btree_hdr_t **c_start = node->btc_children + idx + (shape == BSS_TRAPEZOID ? 0 : 1); zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); - uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bmov(c_start, c_out, c_count * sizeof (*c_start)); } @@ -394,8 +453,8 @@ bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, * false if it is a parallelogram. */ static inline void -bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, - uint64_t count, enum bt_shift_shape shape) +bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, + uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); } @@ -405,8 +464,8 @@ bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, * Starts with elements[idx] and children[idx] and one more child than element. */ static inline void -bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, - uint64_t count, enum bt_shift_shape shape) +bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, + uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); } @@ -417,30 +476,78 @@ bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, * is determined by left. */ static inline void -bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, - uint64_t count, uint64_t off, enum bt_shift_direction dir) +bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx, + uint32_t count, uint32_t off, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; - ASSERT(!node->btl_hdr.bth_core); + zfs_btree_hdr_t *hdr = &node->btl_hdr; + ASSERT(!zfs_btree_is_core(hdr)); - uint8_t *start = node->btl_elems + idx * size; - int sign = (dir == BSD_LEFT ? -1 : +1); - uint8_t *out = start + sign * off * size; + if (count == 0) + return; + uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size; + uint8_t *out = (dir == BSD_LEFT ? start - off * size : + start + off * size); bmov(start, out, count * size); } -static inline void -bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, - uint64_t count) +/* + * Grow leaf for n new elements before idx. + */ +static void +bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, + uint32_t n) { - bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + ASSERT(!zfs_btree_is_core(hdr)); + ASSERT3U(idx, <=, hdr->bth_count); + uint32_t capacity = tree->bt_leaf_cap; + ASSERT3U(hdr->bth_count + n, <=, capacity); + boolean_t cl = (hdr->bth_first >= n); + boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity); + + if (cl && (!cr || idx <= hdr->bth_count / 2)) { + /* Grow left. */ + hdr->bth_first -= n; + bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT); + } else if (cr) { + /* Grow right. */ + bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n, + BSD_RIGHT); + } else { + /* Grow both ways. 
*/ + uint32_t fn = hdr->bth_first - + (capacity - (hdr->bth_count + n)) / 2; + hdr->bth_first -= fn; + bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT); + bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx, + n - fn, BSD_RIGHT); + } + hdr->bth_count += n; } -static inline void -bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, - uint64_t count) +/* + * Shrink leaf for count elements starting from idx. + */ +static void +bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, + uint32_t n) { - bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + ASSERT(!zfs_btree_is_core(hdr)); + ASSERT3U(idx, <=, hdr->bth_count); + ASSERT3U(idx + n, <=, hdr->bth_count); + + if (idx <= (hdr->bth_count - n) / 2) { + bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT); + zfs_btree_poison_node_at(tree, hdr, 0, n); + hdr->bth_first += n; + } else { + bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n, + BSD_LEFT); + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n); + } + hdr->bth_count -= n; } /* @@ -448,32 +555,33 @@ bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, * parameter behaves the same as it does in the shift logic. */ static inline void -bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, - uint64_t count, zfs_btree_core_t *dest, uint64_t didx, +bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx, + uint32_t count, zfs_btree_core_t *dest, uint32_t didx, enum bt_shift_shape shape) { size_t size = tree->bt_elem_size; - ASSERT(source->btc_hdr.bth_core); - ASSERT(dest->btc_hdr.bth_core); + ASSERT(zfs_btree_is_core(&source->btc_hdr)); + ASSERT(zfs_btree_is_core(&dest->btc_hdr)); - bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size, + bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, count * size); - uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); - bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), + uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), c_count * sizeof (*source->btc_children)); } static inline void -bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, - uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) +bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx, + uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx) { size_t size = tree->bt_elem_size; - ASSERT(!source->btl_hdr.bth_core); - ASSERT(!dest->btl_hdr.bth_core); + ASSERT(!zfs_btree_is_core(&source->btl_hdr)); + ASSERT(!zfs_btree_is_core(&dest->btl_hdr)); - bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size, + dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size, count * size); } @@ -482,30 +590,31 @@ bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, * put its location in where if non-null. 
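The bt_grow_leaf()/bt_shrink_leaf() pair above exists because leaves now behave like double-ended buffers: bth_first records where the live elements begin, so making or reclaiming room near either end only shifts the shorter side (or simply moves bth_first). Every leaf access therefore uses the same address arithmetic repeated throughout this file; an equivalent helper (the name is illustrative) would be:

static inline uint8_t *
leaf_elem(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t i)
{
	/* Element i lives bth_first slots past the start of btl_elems. */
	return (leaf->btl_elems +
	    (leaf->btl_hdr.bth_first + i) * tree->bt_elem_size);
}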
*/ static void * -zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) +zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + zfs_btree_index_t *where) { zfs_btree_hdr_t *node; - for (node = hdr; node->bth_core; node = - ((zfs_btree_core_t *)node)->btc_children[0]) + for (node = hdr; zfs_btree_is_core(node); + node = ((zfs_btree_core_t *)node)->btc_children[0]) ; - ASSERT(!node->bth_core); + ASSERT(!zfs_btree_is_core(node)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; if (where != NULL) { where->bti_node = node; where->bti_offset = 0; where->bti_before = B_FALSE; } - return (&leaf->btl_elems[0]); + return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); } /* Insert an element and a child into a core node at the given offset. */ static void zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, - uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) + uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) { - uint64_t size = tree->bt_elem_size; + size_t size = tree->bt_elem_size; zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; ASSERT3P(par_hdr, ==, new_node->bth_parent); ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); @@ -515,13 +624,13 @@ zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, par_hdr->bth_count); } /* Shift existing elements and children */ - uint64_t count = par_hdr->bth_count - offset; + uint32_t count = par_hdr->bth_count - offset; bt_shift_core_right(tree, parent, offset, count, BSS_PARALLELOGRAM); /* Insert new values */ parent->btc_children[offset + 1] = new_node; - bmov(buf, parent->btc_elems + offset * size, size); + bcpy(buf, parent->btc_elems + offset * size, size); par_hdr->bth_count++; } @@ -534,9 +643,8 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, zfs_btree_hdr_t *new_node, void *buf) { ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); - uint64_t size = tree->bt_elem_size; + size_t size = tree->bt_elem_size; zfs_btree_core_t *parent = old_node->bth_parent; - zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; /* * If this is the root node we were splitting, we create a new root @@ -550,13 +658,13 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, size, KM_SLEEP); zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; new_root_hdr->bth_parent = NULL; - new_root_hdr->bth_core = B_TRUE; + new_root_hdr->bth_first = -1; new_root_hdr->bth_count = 1; old_node->bth_parent = new_node->bth_parent = new_root; new_root->btc_children[0] = old_node; new_root->btc_children[1] = new_node; - bmov(buf, new_root->btc_elems, size); + bcpy(buf, new_root->btc_elems, size); tree->bt_height++; tree->bt_root = new_root_hdr; @@ -568,12 +676,13 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, * Since we have the new separator, binary search for where to put * new_node. 
*/ + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; zfs_btree_index_t idx; - ASSERT(par_hdr->bth_core); - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + ASSERT(zfs_btree_is_core(par_hdr)); + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); - uint64_t offset = idx.bti_offset; + uint32_t offset = idx.bti_offset; ASSERT3U(offset, <=, par_hdr->bth_count); ASSERT3P(parent->btc_children[offset], ==, old_node); @@ -604,16 +713,16 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, * We do this in two stages: first we split into two nodes, and then we * reuse our existing logic to insert the new element and child. */ - uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? + uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? 2 : 4)) - 1, 2); - uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; + uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1; ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); tree->bt_num_nodes++; zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * size, KM_SLEEP); zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; new_par_hdr->bth_parent = par_hdr->bth_parent; - new_par_hdr->bth_core = B_TRUE; + new_par_hdr->bth_first = -1; new_par_hdr->bth_count = move_count; zfs_btree_poison_node(tree, new_par_hdr); @@ -624,7 +733,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); - bmov(parent->btc_elems + keep_count * size, tmp_buf, + bcpy(parent->btc_elems + keep_count * size, tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); @@ -636,7 +745,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* * Move the new separator to the existing buffer. */ - bmov(tmp_buf, buf, size); + bcpy(tmp_buf, buf, size); } else if (offset > keep_count) { /* Insert the new node into the right half */ new_node->bth_parent = new_parent; @@ -646,7 +755,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* * Move the new separator to the existing buffer. */ - bmov(tmp_buf, buf, size); + bcpy(tmp_buf, buf, size); } else { /* * Move the new separator into the right half, and replace it @@ -656,16 +765,16 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, bt_shift_core_right(tree, new_parent, 0, move_count, BSS_TRAPEZOID); new_parent->btc_children[0] = new_node; - bmov(tmp_buf, new_parent->btc_elems, size); + bcpy(tmp_buf, new_parent->btc_elems, size); new_par_hdr->bth_count++; } kmem_free(tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); - for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) + for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++) new_parent->btc_children[i]->bth_parent = new_parent; - for (int i = 0; i <= parent->btc_hdr.bth_count; i++) + for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++) ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); /* @@ -679,34 +788,32 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* Insert an element into a leaf node at the given offset. 
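The split sizing in the hunk above keeps bulk-loaded trees dense: a regular insert splits a full core node roughly in half, while a bulk-mode split leaves about three quarters of the elements in place, since bulk insertion only ever appends at the right edge. Taking BTREE_CORE_ELEMS as 128 purely for illustration (the real value is whatever btree.h defines):

	/* regular insert: move_count = 128/2 - 1 = 63, keep_count = 64 */
	/* bulk insert:    move_count = 128/4 - 1 = 31, keep_count = 96 */
	/*
	 * In both cases keep_count + move_count + 1 == BTREE_CORE_ELEMS;
	 * the leftover element becomes the separator pushed into the parent.
	 */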
*/ static void zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, - uint64_t idx, const void *value) + uint32_t idx, const void *value) { - uint64_t size = tree->bt_elem_size; - uint8_t *start = leaf->btl_elems + (idx * size); + size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; - uint64_t capacity __maybe_unused = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); - uint64_t count = leaf->btl_hdr.bth_count - idx; - ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); + ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap); if (zfs_btree_verify_intensity >= 5) { zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, leaf->btl_hdr.bth_count); } - bt_shift_leaf_right(tree, leaf, idx, count); - bmov(value, start, size); - hdr->bth_count++; + bt_grow_leaf(tree, leaf, idx, 1); + uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size; + bcpy(value, start, size); } +static void +zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr); + /* Helper function for inserting a new value into leaf at the given index. */ static void zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, - const void *value, uint64_t idx) + const void *value, uint32_t idx) { - uint64_t size = tree->bt_elem_size; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); + size_t size = tree->bt_elem_size; + uint32_t capacity = tree->bt_leaf_cap; /* * If the leaf isn't full, shift the elements after idx and insert @@ -731,32 +838,35 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, * In either case, we're left with one extra element. The leftover * element will become the new dividing element between the two nodes. */ - uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - - 1, 2); - uint64_t keep_count = capacity - move_count - 1; - ASSERT3U(capacity - move_count, >=, 2); + uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1; + uint32_t keep_count = capacity - move_count - 1; + ASSERT3U(keep_count, >=, 1); + /* If we insert on left. move one more to keep leaves balanced. */ + if (idx < keep_count) { + keep_count--; + move_count++; + } tree->bt_num_nodes++; - zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, - KM_SLEEP); + zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree); zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; new_hdr->bth_parent = leaf->btl_hdr.bth_parent; - new_hdr->bth_core = B_FALSE; + new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + + (idx >= keep_count && idx <= keep_count + move_count / 2); new_hdr->bth_count = move_count; zfs_btree_poison_node(tree, new_hdr); - leaf->btl_hdr.bth_count = keep_count; - if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) tree->bt_bulk = new_leaf; /* Copy the back part to the new leaf. */ - bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, - 0); + bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0); /* We store the new separator in a buffer we control for simplicity. */ uint8_t *buf = kmem_alloc(size, KM_SLEEP); - bmov(leaf->btl_elems + (keep_count * size), buf, size); - zfs_btree_poison_node(tree, &leaf->btl_hdr); + bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size, + buf, size); + + bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count); if (idx < keep_count) { /* Insert into the existing leaf. 
*/ @@ -767,13 +877,11 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, 1, value); } else { /* - * Shift the elements in the new leaf to make room for the - * separator, and use the new value as the new separator. + * Insert planned separator into the new leaf, and use + * the new value as the new separator. */ - bt_shift_leaf_right(tree, new_leaf, 0, move_count); - bmov(buf, new_leaf->btl_elems, size); - bmov(value, buf, size); - new_hdr->bth_count++; + zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf); + bcpy(value, buf, size); } /* @@ -785,18 +893,19 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, kmem_free(buf, size); } -static uint64_t +static uint32_t zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { void *buf; - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { buf = ((zfs_btree_core_t *)hdr)->btc_elems; } else { - buf = ((zfs_btree_leaf_t *)hdr)->btl_elems; + buf = ((zfs_btree_leaf_t *)hdr)->btl_elems + + hdr->bth_first * tree->bt_elem_size; } zfs_btree_index_t idx; zfs_btree_core_t *parent = hdr->bth_parent; - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); @@ -821,9 +930,8 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) zfs_btree_leaf_t *leaf = tree->bt_bulk; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; zfs_btree_core_t *parent = hdr->bth_parent; - uint64_t size = tree->bt_elem_size; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); + size_t size = tree->bt_elem_size; + uint32_t capacity = tree->bt_leaf_cap; /* * The invariant doesn't apply to the root node, if that's the only @@ -848,56 +956,54 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) .bti_offset = 0 }; VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); - ASSERT(idx.bti_node->bth_core); + ASSERT(zfs_btree_is_core(idx.bti_node)); zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; - uint64_t common_idx = idx.bti_offset; + uint32_t common_idx = idx.bti_offset; VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); - ASSERT(!idx.bti_node->bth_core); + ASSERT(!zfs_btree_is_core(idx.bti_node)); zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; zfs_btree_hdr_t *l_hdr = idx.bti_node; - uint64_t move_count = (capacity / 2) - hdr->bth_count; + uint32_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, capacity / 2); if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < move_count; i++) { + for (uint32_t i = 0; i < move_count; i++) { zfs_btree_verify_poison_at(tree, hdr, leaf->btl_hdr.bth_count + i); } } /* First, shift elements in leaf back. */ - bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, - BSD_RIGHT); + bt_grow_leaf(tree, leaf, 0, move_count); /* Next, move the separator from the common ancestor to leaf. */ - uint8_t *separator = common->btc_elems + (common_idx * size); - uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); - bmov(separator, out, size); - move_count--; + uint8_t *separator = common->btc_elems + common_idx * size; + uint8_t *out = leaf->btl_elems + + (hdr->bth_first + move_count - 1) * size; + bcpy(separator, out, size); /* * Now we move elements from the tail of the left neighbor to * fill the remaining spots in leaf. 
*/ bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - - move_count, move_count, leaf, 0); + (move_count - 1), move_count - 1, leaf, 0); /* * Finally, move the new last element in the left neighbor to * the separator. */ - bmov(l_neighbor->btl_elems + (l_hdr->bth_count - - move_count - 1) * size, separator, size); + bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + + l_hdr->bth_count - move_count) * size, separator, size); /* Adjust the node's counts, and we're done. */ - l_hdr->bth_count -= move_count + 1; - hdr->bth_count += move_count + 1; + bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, + move_count); ASSERT3U(l_hdr->bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); - zfs_btree_poison_node(tree, l_hdr); } /* @@ -921,16 +1027,16 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) * splitting is 2, we never need to worry about not having a * left sibling (a sibling is a neighbor with the same parent). */ - uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); ASSERT3U(parent_idx, >, 0); zfs_btree_core_t *l_neighbor = (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; - uint64_t move_count = (capacity / 2) - hdr->bth_count; + uint32_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, capacity / 2); if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < move_count; i++) { + for (uint32_t i = 0; i < move_count; i++) { zfs_btree_verify_poison_at(tree, hdr, hdr->bth_count + i); } @@ -943,14 +1049,14 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * size); uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); - bmov(separator, e_out, size); + bcpy(separator, e_out, size); /* * Now, move elements and children from the left node to the * right. We move one more child than elements. */ move_count--; - uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; + uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, BSS_TRAPEZOID); @@ -959,7 +1065,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) * separator's position. */ move_idx--; - bmov(l_neighbor->btc_elems + move_idx * size, separator, size); + bcpy(l_neighbor->btc_elems + move_idx * size, separator, size); l_neighbor->btc_hdr.bth_count -= move_count + 1; hdr->bth_count += move_count + 1; @@ -969,11 +1075,12 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); - for (int i = 0; i <= hdr->bth_count; i++) + for (uint32_t i = 0; i <= hdr->bth_count; i++) cur->btc_children[i]->bth_parent = cur; } tree->bt_bulk = NULL; + zfs_btree_verify(tree); } /* @@ -1006,20 +1113,19 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value, ASSERT0(where->bti_offset); tree->bt_num_nodes++; - zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, - KM_SLEEP); + zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree); tree->bt_root = &leaf->btl_hdr; tree->bt_height++; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; hdr->bth_parent = NULL; - hdr->bth_core = B_FALSE; + hdr->bth_first = 0; hdr->bth_count = 0; zfs_btree_poison_node(tree, hdr); zfs_btree_insert_into_leaf(tree, leaf, value, 0); tree->bt_bulk = leaf; - } else if (!where->bti_node->bth_core) { + } else if (!zfs_btree_is_core(where->bti_node)) { /* * If we're inserting into a leaf, go directly to the helper * function. 
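A typical caller-side pattern for the insertion path above (a usage sketch, not code from this change) is to search once and reuse the miss position, since zfs_btree_add_idx() requires that the value not already be in the tree:

	/* bt and key stand in for an existing tree and element. */
	zfs_btree_index_t where;
	if (zfs_btree_find(&bt, &key, &where) == NULL)
		zfs_btree_add_idx(&bt, &key, &where);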
@@ -1035,28 +1141,28 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value, * value in the node at that spot and then insert the old * separator into the first slot in the subtree to the right. */ - ASSERT(where->bti_node->bth_core); zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; /* * We can ignore bti_before, because either way the value * should end up in bti_offset. */ - uint64_t off = where->bti_offset; + uint32_t off = where->bti_offset; zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; size_t size = tree->bt_elem_size; uint8_t *buf = kmem_alloc(size, KM_SLEEP); - bmov(node->btc_elems + off * size, buf, size); - bmov(value, node->btc_elems + off * size, size); + bcpy(node->btc_elems + off * size, buf, size); + bcpy(value, node->btc_elems + off * size, size); /* * Find the first slot in the subtree to the right, insert * there. */ zfs_btree_index_t new_idx; - VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); + VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, + NULL); ASSERT0(new_idx.bti_offset); - ASSERT(!new_idx.bti_node->bth_core); + ASSERT(!zfs_btree_is_core(new_idx.bti_node)); zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); kmem_free(buf, size); @@ -1075,7 +1181,7 @@ zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) ASSERT0(tree->bt_num_elems); return (NULL); } - return (zfs_btree_first_helper(tree->bt_root, where)); + return (zfs_btree_first_helper(tree, tree->bt_root, where)); } /* @@ -1088,7 +1194,7 @@ zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, { zfs_btree_hdr_t *node; - for (node = hdr; node->bth_core; node = + for (node = hdr; zfs_btree_is_core(node); node = ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) ; @@ -1098,7 +1204,8 @@ zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, where->bti_offset = node->bth_count - 1; where->bti_before = B_FALSE; } - return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size); + return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * + btree->bt_elem_size); } /* @@ -1131,8 +1238,8 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, return (NULL); } - uint64_t offset = idx->bti_offset; - if (!idx->bti_node->bth_core) { + uint32_t offset = idx->bti_offset; + if (!zfs_btree_is_core(idx->bti_node)) { /* * When finding the next element of an element in a leaf, * there are two cases. If the element isn't the last one in @@ -1143,20 +1250,21 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, * separator after our ancestor in its parent. */ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; - uint64_t new_off = offset + (idx->bti_before ? 0 : 1); + uint32_t new_off = offset + (idx->bti_before ? 
0 : 1); if (leaf->btl_hdr.bth_count > new_off) { out_idx->bti_node = &leaf->btl_hdr; out_idx->bti_offset = new_off; out_idx->bti_before = B_FALSE; - return (leaf->btl_elems + new_off * tree->bt_elem_size); + return (leaf->btl_elems + (leaf->btl_hdr.bth_first + + new_off) * tree->bt_elem_size); } zfs_btree_hdr_t *prev = &leaf->btl_hdr; for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { zfs_btree_hdr_t *hdr = &node->btc_hdr; - ASSERT(hdr->bth_core); - uint64_t i = zfs_btree_find_parent_idx(tree, prev); + ASSERT(zfs_btree_is_core(hdr)); + uint32_t i = zfs_btree_find_parent_idx(tree, prev); if (done_func != NULL) done_func(tree, prev); if (i == hdr->bth_count) { @@ -1178,7 +1286,7 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, } /* If we were before an element in a core node, return that element. */ - ASSERT(idx->bti_node->bth_core); + ASSERT(zfs_btree_is_core(idx->bti_node)); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; if (idx->bti_before) { out_idx->bti_before = B_FALSE; @@ -1190,7 +1298,7 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, * the subtree just to the right of the separator. */ zfs_btree_hdr_t *child = node->btc_children[offset + 1]; - return (zfs_btree_first_helper(child, out_idx)); + return (zfs_btree_first_helper(tree, child, out_idx)); } /* @@ -1217,8 +1325,8 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, return (NULL); } - uint64_t offset = idx->bti_offset; - if (!idx->bti_node->bth_core) { + uint32_t offset = idx->bti_offset; + if (!zfs_btree_is_core(idx->bti_node)) { /* * When finding the previous element of an element in a leaf, * there are two cases. If the element isn't the first one in @@ -1233,15 +1341,15 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, out_idx->bti_node = &leaf->btl_hdr; out_idx->bti_offset = offset - 1; out_idx->bti_before = B_FALSE; - return (leaf->btl_elems + (offset - 1) * - tree->bt_elem_size); + return (leaf->btl_elems + (leaf->btl_hdr.bth_first + + offset - 1) * tree->bt_elem_size); } zfs_btree_hdr_t *prev = &leaf->btl_hdr; for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { zfs_btree_hdr_t *hdr = &node->btc_hdr; - ASSERT(hdr->bth_core); - uint64_t i = zfs_btree_find_parent_idx(tree, prev); + ASSERT(zfs_btree_is_core(hdr)); + uint32_t i = zfs_btree_find_parent_idx(tree, prev); if (i == 0) { prev = hdr; continue; @@ -1262,7 +1370,7 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, * The previous element from one in a core node is the last element in * the subtree just to the left of the separator. 
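These prev/next helpers back the public iteration interface; an ascending walk (sketch only, assuming the usual zfs_btree_next() wrapper and with process() as a stand-in) passes the same index as both the input and output cursor:

	zfs_btree_index_t idx;
	for (void *e = zfs_btree_first(&bt, &idx); e != NULL;
	    e = zfs_btree_next(&bt, &idx, &idx))
		process(e);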
*/ - ASSERT(idx->bti_node->bth_core); + ASSERT(zfs_btree_is_core(idx->bti_node)); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; zfs_btree_hdr_t *child = node->btc_children[offset]; return (zfs_btree_last_helper(tree, child, out_idx)); @@ -1279,13 +1387,14 @@ void * zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) { ASSERT(!idx->bti_before); - if (!idx->bti_node->bth_core) { + size_t size = tree->bt_elem_size; + if (!zfs_btree_is_core(idx->bti_node)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; - return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); + return (leaf->btl_elems + (leaf->btl_hdr.bth_first + + idx->bti_offset) * size); } - ASSERT(idx->bti_node->bth_core); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; - return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); + return (node->btc_elems + idx->bti_offset * size); } /* Add the given value to the tree. Must not already be in the tree. */ @@ -1302,8 +1411,8 @@ static void zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) { tree->bt_num_nodes--; - if (!node->bth_core) { - kmem_cache_free(zfs_btree_leaf_cache, node); + if (!zfs_btree_is_core(node)) { + zfs_btree_leaf_free(tree, node); } else { kmem_free(node, sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * tree->bt_elem_size); @@ -1320,7 +1429,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_hdr_t *rm_hdr) { size_t size = tree->bt_elem_size; - uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; + uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1; zfs_btree_hdr_t *hdr = &node->btc_hdr; /* * If the node is the root node and rm_hdr is one of two children, @@ -1337,7 +1446,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, return; } - uint64_t idx; + uint32_t idx; for (idx = 0; idx <= hdr->bth_count; idx++) { if (node->btc_children[idx] == rm_hdr) break; @@ -1357,7 +1466,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, BSS_PARALLELOGRAM); hdr->bth_count--; - zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1); return; } @@ -1378,13 +1487,13 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * implementing in the future for completeness' sake. */ zfs_btree_core_t *parent = hdr->bth_parent; - uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ - ASSERT(l_hdr->bth_core); + ASSERT(zfs_btree_is_core(l_hdr)); zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; /* @@ -1399,20 +1508,19 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bmov(separator, node->btc_elems, size); + bcpy(separator, node->btc_elems, size); /* Move the last child of neighbor to our first child slot. */ - zfs_btree_hdr_t **take_child = neighbor->btc_children + - l_hdr->bth_count; - bmov(take_child, node->btc_children, sizeof (*take_child)); + node->btc_children[0] = + neighbor->btc_children[l_hdr->bth_count]; node->btc_children[0]->bth_parent = node; /* Move the last element of neighbor to the separator spot. 
*/ uint8_t *take_elem = neighbor->btc_elems + (l_hdr->bth_count - 1) * size; - bmov(take_elem, separator, size); + bcpy(take_elem, separator, size); l_hdr->bth_count--; - zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1); return; } @@ -1420,7 +1528,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ - ASSERT(r_hdr->bth_core); + ASSERT(zfs_btree_is_core(r_hdr)); zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; /* @@ -1435,21 +1543,19 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * element spot in node. */ uint8_t *separator = parent->btc_elems + parent_idx * size; - bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, + bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size, size); /* * Move the first child of neighbor to the last child spot in * node. */ - zfs_btree_hdr_t **take_child = neighbor->btc_children; - bmov(take_child, node->btc_children + hdr->bth_count, - sizeof (*take_child)); + node->btc_children[hdr->bth_count] = neighbor->btc_children[0]; node->btc_children[hdr->bth_count]->bth_parent = node; /* Move the first element of neighbor to the separator spot. */ uint8_t *take_elem = neighbor->btc_elems; - bmov(take_elem, separator, size); + bcpy(take_elem, separator, size); r_hdr->bth_count--; /* @@ -1458,7 +1564,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, */ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, BSS_TRAPEZOID); - zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1); return; } @@ -1473,7 +1579,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * merging. */ zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; - uint64_t new_idx = idx; + uint32_t new_idx = idx; if (l_hdr != NULL) { keep_hdr = l_hdr; new_rm_hdr = hdr; @@ -1485,14 +1591,14 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, parent_idx++; } - ASSERT(keep_hdr->bth_core); - ASSERT(new_rm_hdr->bth_core); + ASSERT(zfs_btree_is_core(keep_hdr)); + ASSERT(zfs_btree_is_core(new_rm_hdr)); zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) { + for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) { zfs_btree_verify_poison_at(tree, keep_hdr, keep_hdr->bth_count + i); } @@ -1502,14 +1608,14 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bmov(separator, e_out, size); + bcpy(separator, e_out, size); keep_hdr->bth_count++; /* Move all our elements and children into the left node. */ bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, keep_hdr->bth_count, BSS_TRAPEZOID); - uint64_t old_count = keep_hdr->bth_count; + uint32_t old_count = keep_hdr->bth_count; /* Update bookkeeping */ keep_hdr->bth_count += new_rm_hdr->bth_count; @@ -1527,17 +1633,17 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, /* Reparent all our children to point to the left node. 
*/ zfs_btree_hdr_t **new_start = keep->btc_children + old_count - 1; - for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) new_start[i]->bth_parent = keep; - for (int i = 0; i <= keep_hdr->bth_count; i++) { + for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) { ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); ASSERT3P(keep->btc_children[i], !=, rm_hdr); } - zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); new_rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, new_rm_hdr); zfs_btree_remove_from_node(tree, parent, new_rm_hdr); + zfs_btree_node_destroy(tree, new_rm_hdr); } /* Remove the element at the specific location. */ @@ -1546,9 +1652,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = where->bti_node; - uint64_t idx = where->bti_offset; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / size, 2); + uint32_t idx = where->bti_offset; ASSERT(!where->bti_before); if (tree->bt_bulk != NULL) { @@ -1560,7 +1664,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) */ uint8_t *value = zfs_btree_get(tree, where); uint8_t *tmp = kmem_alloc(size, KM_SLEEP); - bmov(value, tmp, size); + bcpy(value, tmp, size); zfs_btree_bulk_finish(tree); VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); kmem_free(tmp, size); @@ -1575,14 +1679,14 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * makes the rebalance logic not need to be recursive both upwards and * downwards. */ - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; void *new_value = zfs_btree_last_helper(tree, left_subtree, where); ASSERT3P(new_value, !=, NULL); - bmov(new_value, node->btc_elems + idx * size, size); + bcpy(new_value, node->btc_elems + idx * size, size); hdr = where->bti_node; idx = where->bti_offset; @@ -1594,19 +1698,18 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * elements after the idx to the left. After that, we rebalance if * needed. */ - ASSERT(!hdr->bth_core); + ASSERT(!zfs_btree_is_core(hdr)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; ASSERT3U(hdr->bth_count, >, 0); - uint64_t min_count = (capacity / 2) - 1; + uint32_t min_count = (tree->bt_leaf_cap / 2) - 1; /* * If we're over the minimum size or this is the root, just overwrite * the value and return. */ if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { - hdr->bth_count--; - bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); + bt_shrink_leaf(tree, leaf, idx, 1); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); if (hdr->bth_count == 0) { @@ -1615,8 +1718,6 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) zfs_btree_node_destroy(tree, &leaf->btl_hdr); } } - if (tree->bt_root != NULL) - zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); zfs_btree_verify(tree); return; } @@ -1636,33 +1737,33 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * worth implementing in the future for completeness' sake. */ zfs_btree_core_t *parent = hdr->bth_parent; - uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? 
NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ - ASSERT(!l_hdr->bth_core); + ASSERT(!zfs_btree_is_core(l_hdr)); + zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr; /* * Move our elements back by one spot to make room for the * stolen element and overwrite the element being removed. */ - bt_shift_leaf_right(tree, leaf, 0, idx); + bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT); + + /* Move the separator to our first spot. */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems + - (l_hdr->bth_count - 1) * size; - /* Move the separator to our first spot. */ - bmov(separator, leaf->btl_elems, size); + bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size); /* Move our neighbor's last element to the separator. */ - bmov(take_elem, separator, size); - - /* Update the bookkeeping. */ - l_hdr->bth_count--; - zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + uint8_t *take_elem = neighbor->btl_elems + + (l_hdr->bth_first + l_hdr->bth_count - 1) * size; + bcpy(take_elem, separator, size); + /* Delete our neighbor's last element. */ + bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1); zfs_btree_verify(tree); return; } @@ -1671,7 +1772,7 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ - ASSERT(!r_hdr->bth_core); + ASSERT(!zfs_btree_is_core(r_hdr)); zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; /* @@ -1679,96 +1780,81 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) * by one spot to make room for the stolen element and * overwrite the element being removed. */ - bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx - - 1); + bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1, + 1, BSD_LEFT); - uint8_t *separator = parent->btc_elems + parent_idx * size; - uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; /* Move the separator between us to our last spot. */ - bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, - size); + uint8_t *separator = parent->btc_elems + parent_idx * size; + bcpy(separator, leaf->btl_elems + (hdr->bth_first + + hdr->bth_count - 1) * size, size); /* Move our neighbor's first element to the separator. */ - bmov(take_elem, separator, size); + uint8_t *take_elem = neighbor->btl_elems + + r_hdr->bth_first * size; + bcpy(take_elem, separator, size); - /* Update the bookkeeping. */ - r_hdr->bth_count--; - - /* - * Move our neighbors elements forwards to overwrite the - * stolen element. - */ - bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count); - zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + /* Delete our neighbor's first element. */ + bt_shrink_leaf(tree, neighbor, 0, 1); zfs_btree_verify(tree); return; } /* * In this case, neither of our neighbors can spare an element, so we - * need to merge with one of them. We prefer the left one, - * arbitrarily. Move the separator into the leftmost merging node + * need to merge with one of them. We prefer the left one, arbitrarily. + * After remove we move the separator into the leftmost merging node * (which may be us or the left neighbor), and then move the right * merging node's elements. Once that's done, we go back and delete * the element we're removing. 
Finally, go into the parent and delete * the right merging node and the separator. This may cause further * merging. */ - zfs_btree_hdr_t *rm_hdr, *keep_hdr; - uint64_t new_idx = idx; + zfs_btree_hdr_t *rm_hdr, *k_hdr; if (l_hdr != NULL) { - keep_hdr = l_hdr; + k_hdr = l_hdr; rm_hdr = hdr; - new_idx += keep_hdr->bth_count + 1; // 449 } else { ASSERT3P(r_hdr, !=, NULL); - keep_hdr = hdr; + k_hdr = hdr; rm_hdr = r_hdr; parent_idx++; } - - ASSERT(!keep_hdr->bth_core); - ASSERT(!rm_hdr->bth_core); - ASSERT3U(keep_hdr->bth_count, ==, min_count); + ASSERT(!zfs_btree_is_core(k_hdr)); + ASSERT(!zfs_btree_is_core(rm_hdr)); + ASSERT3U(k_hdr->bth_count, ==, min_count); ASSERT3U(rm_hdr->bth_count, ==, min_count); - - zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr; + zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < rm_hdr->bth_count + 1; i++) { - zfs_btree_verify_poison_at(tree, keep_hdr, - keep_hdr->bth_count + i); + for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, k_hdr, + k_hdr->bth_count + i); } } + /* - * Move the separator into the first open spot in the left - * neighbor. + * Remove the value from the node. It will go below the minimum, + * but we'll fix it in no time. */ - uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size; - uint8_t *separator = parent->btc_elems + (parent_idx - 1) * - size; - bmov(separator, out, size); - keep_hdr->bth_count++; + bt_shrink_leaf(tree, leaf, idx, 1); - /* Move our elements to the left neighbor. */ - bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, - keep_hdr->bth_count); + /* Prepare space for elements to be moved from the right. */ + uint32_t k_count = k_hdr->bth_count; + bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); + ASSERT3U(k_hdr->bth_count, ==, min_count * 2); - /* Update the bookkeeping. */ - keep_hdr->bth_count += rm_hdr->bth_count; - ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1); + /* Move the separator into the first open spot. */ + uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; + bcpy(separator, out, size); - /* Remove the value from the node */ - keep_hdr->bth_count--; - bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count - - new_idx); - zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + /* Move our elements to the left neighbor. */ + bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); - rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. 
*/ zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_node_destroy(tree, rm_hdr); zfs_btree_verify(tree); } @@ -1831,11 +1917,10 @@ zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) static void zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { - if (hdr->bth_core) { + if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) zfs_btree_clear_helper(tree, btc->btc_children[i]); - } } zfs_btree_node_destroy(tree, hdr); @@ -1868,11 +1953,11 @@ zfs_btree_destroy(zfs_btree_t *tree) static void zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { - if (!hdr->bth_core) + if (!zfs_btree_is_core(hdr)) return; zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); } @@ -1897,11 +1982,10 @@ zfs_btree_verify_pointers(zfs_btree_t *tree) static uint64_t zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { - if (!hdr->bth_core) { - if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); - VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); + if (!zfs_btree_is_core(hdr)) { + if (tree->bt_root != hdr && tree->bt_bulk && + hdr != &tree->bt_bulk->btl_hdr) { + VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); } return (hdr->bth_count); @@ -1911,7 +1995,7 @@ zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) uint64_t ret = hdr->bth_count; if (tree->bt_root != hdr && tree->bt_bulk == NULL) VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_counts_helper(tree, node->btc_children[i]); } @@ -1941,17 +2025,16 @@ zfs_btree_verify_counts(zfs_btree_t *tree) */ static uint64_t zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - int64_t height) + int32_t height) { - if (!hdr->bth_core) { + if (!zfs_btree_is_core(hdr)) { VERIFY0(height); return (1); } - VERIFY(hdr->bth_core); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = 1; - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_height_helper(tree, node->btc_children[i], height - 1); } @@ -1983,24 +2066,26 @@ static void zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; - if (!hdr->bth_core) { + if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - for (int i = 1; i < hdr->bth_count; i++) { - VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) * - size, leaf->btl_elems + i * size), ==, -1); + for (uint32_t i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(leaf->btl_elems + + (hdr->bth_first + i - 1) * size, + leaf->btl_elems + + (hdr->bth_first + i) * size), ==, -1); } return; } zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - for (int i = 1; i < hdr->bth_count; i++) { + for (uint32_t i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, node->btc_elems + i * size), ==, -1); } - for (int i = 0; i < hdr->bth_count; i++) { + for (uint32_t i = 0; i < hdr->bth_count; i++) 
{ uint8_t *left_child_last = NULL; zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; - if (left_child_hdr->bth_core) { + if (zfs_btree_is_core(left_child_hdr)) { zfs_btree_core_t *left_child = (zfs_btree_core_t *)left_child_hdr; left_child_last = left_child->btc_elems + @@ -2009,40 +2094,39 @@ zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) zfs_btree_leaf_t *left_child = (zfs_btree_leaf_t *)left_child_hdr; left_child_last = left_child->btl_elems + - (left_child_hdr->bth_count - 1) * size; + (left_child_hdr->bth_first + + left_child_hdr->bth_count - 1) * size; } - if (tree->bt_compar(node->btc_elems + i * size, - left_child_last) != 1) { + int comp = tree->bt_compar(node->btc_elems + i * size, + left_child_last); + if (comp <= 0) { panic("btree: compar returned %d (expected 1) at " - "%px %d: compar(%px, %px)", tree->bt_compar( - node->btc_elems + i * size, left_child_last), - (void *)node, i, (void *)(node->btc_elems + i * - size), (void *)left_child_last); + "%px %d: compar(%px, %px)", comp, node, i, + node->btc_elems + i * size, left_child_last); } uint8_t *right_child_first = NULL; zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; - if (right_child_hdr->bth_core) { + if (zfs_btree_is_core(right_child_hdr)) { zfs_btree_core_t *right_child = (zfs_btree_core_t *)right_child_hdr; right_child_first = right_child->btc_elems; } else { zfs_btree_leaf_t *right_child = (zfs_btree_leaf_t *)right_child_hdr; - right_child_first = right_child->btl_elems; + right_child_first = right_child->btl_elems + + right_child_hdr->bth_first * size; } - if (tree->bt_compar(node->btc_elems + i * size, - right_child_first) != -1) { + comp = tree->bt_compar(node->btc_elems + i * size, + right_child_first); + if (comp >= 0) { panic("btree: compar returned %d (expected -1) at " - "%px %d: compar(%px, %px)", tree->bt_compar( - node->btc_elems + i * size, right_child_first), - (void *)node, i, (void *)(node->btc_elems + i * - size), (void *)right_child_first); + "%px %d: compar(%px, %px)", comp, node, i, + node->btc_elems + i * size, right_child_first); } } - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) zfs_btree_verify_order_helper(tree, node->btc_children[i]); - } } /* Check that all elements in the tree are in sorted order. 
*/ @@ -2063,27 +2147,28 @@ static void zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; - if (!hdr->bth_core) { + if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; - uint8_t val = 0x0f; - for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - - sizeof (zfs_btree_hdr_t); i++) { - VERIFY3U(leaf->btl_elems[i], ==, val); - } + for (size_t i = 0; i < hdr->bth_first * size; i++) + VERIFY3U(leaf->btl_elems[i], ==, 0x0f); + size_t esize = tree->bt_leaf_size - + offsetof(zfs_btree_leaf_t, btl_elems); + for (size_t i = (hdr->bth_first + hdr->bth_count) * size; + i < esize; i++) + VERIFY3U(leaf->btl_elems[i], ==, 0x0f); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; - uint8_t val = 0x0f; - for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; - i++) { - VERIFY3U(node->btc_elems[i], ==, val); - } + for (size_t i = hdr->bth_count * size; + i < BTREE_CORE_ELEMS * size; i++) + VERIFY3U(node->btc_elems[i], ==, 0x0f); - for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; + i++) { VERIFY3P(node->btc_children[i], ==, (zfs_btree_hdr_t *)BTREE_POISON); } - for (int i = 0; i <= hdr->bth_count; i++) { + for (uint32_t i = 0; i <= hdr->bth_count; i++) { zfs_btree_verify_poison_helper(tree, node->btc_children[i]); } @@ -2122,3 +2207,9 @@ zfs_btree_verify(zfs_btree_t *tree) return; zfs_btree_verify_poison(tree); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, + "Enable btree verification. Levels above 4 require ZFS be built " + "with debugging"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c index 3fbb24ddef5e..2ac058fd2c93 100644 --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -37,18 +37,37 @@ static dataset_kstat_values_t empty_dataset_kstats = { { "nread", KSTAT_DATA_UINT64 }, { "nunlinks", KSTAT_DATA_UINT64 }, { "nunlinked", KSTAT_DATA_UINT64 }, + { + { "zil_commit_count", KSTAT_DATA_UINT64 }, + { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, + { "zil_itx_count", KSTAT_DATA_UINT64 }, + { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, + { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, + { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, + { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 } + } }; static int dataset_kstats_update(kstat_t *ksp, int rw) { dataset_kstats_t *dk = ksp->ks_private; - ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data); + dataset_kstat_values_t *dkv = ksp->ks_data; + ASSERT3P(dk->dk_kstats->ks_data, ==, dkv); if (rw == KSTAT_WRITE) return (EACCES); - dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; dkv->dkv_writes.value.ui64 = wmsum_value(&dk->dk_sums.dss_writes); dkv->dkv_nwritten.value.ui64 = @@ -62,10 +81,12 @@ dataset_kstats_update(kstat_t *ksp, int rw) dkv->dkv_nunlinked.value.ui64 = wmsum_value(&dk->dk_sums.dss_nunlinked); + zil_kstat_values_update(&dkv->dkv_zil_stats, &dk->dk_zil_sums); + return (0); } -void +int dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) { /* @@ -75,7 +96,7 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) * a filesystem with many snapshots, we skip them for now. 
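dataset_kstats_create() now returns an int instead of void so that failures (seen in the following hunks: a bad snprintf() result, an over-long kstat name, or kstat_create() returning NULL) can propagate to the caller. A hedged sketch of the checking this enables at call sites (the embedding structure is assumed, not shown in this hunk):

	dataset_kstats_t dk;	/* normally embedded in zvol/zfsvfs state */
	int error = dataset_kstats_create(&dk, os);
	if (error != 0)
		return (error);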
*/ if (dmu_objset_is_snapshot(objset)) - return; + return (0); /* * At the time of this writing, KSTAT_STRLEN is 255 in Linux, @@ -94,13 +115,13 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) zfs_dbgmsg("failed to create dataset kstat for objset %lld: " " snprintf() for kstat module name returned %d", (unsigned long long)dmu_objset_id(objset), n); - return; + return (SET_ERROR(EINVAL)); } else if (n >= KSTAT_STRLEN) { zfs_dbgmsg("failed to create dataset kstat for objset %lld: " "kstat module name length (%d) exceeds limit (%d)", (unsigned long long)dmu_objset_id(objset), n, KSTAT_STRLEN); - return; + return (SET_ERROR(ENAMETOOLONG)); } char kstat_name[KSTAT_STRLEN]; @@ -110,20 +131,25 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) zfs_dbgmsg("failed to create dataset kstat for objset %lld: " " snprintf() for kstat name returned %d", (unsigned long long)dmu_objset_id(objset), n); - return; + return (SET_ERROR(EINVAL)); + } else if (n >= KSTAT_STRLEN) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + "kstat name length (%d) exceeds limit (%d)", + (unsigned long long)dmu_objset_id(objset), + n, KSTAT_STRLEN); + return (SET_ERROR(ENAMETOOLONG)); } - ASSERT3U(n, <, KSTAT_STRLEN); kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name, "dataset", KSTAT_TYPE_NAMED, sizeof (empty_dataset_kstats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (kstat == NULL) - return; + return (SET_ERROR(ENOMEM)); dataset_kstat_values_t *dk_kstats = kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP); - bcopy(&empty_dataset_kstats, dk_kstats, + memcpy(dk_kstats, &empty_dataset_kstats, sizeof (empty_dataset_kstats)); char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -137,15 +163,17 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) kstat->ks_private = dk; kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN; - kstat_install(kstat); - dk->dk_kstats = kstat; - wmsum_init(&dk->dk_sums.dss_writes, 0); wmsum_init(&dk->dk_sums.dss_nwritten, 0); wmsum_init(&dk->dk_sums.dss_reads, 0); wmsum_init(&dk->dk_sums.dss_nread, 0); wmsum_init(&dk->dk_sums.dss_nunlinks, 0); wmsum_init(&dk->dk_sums.dss_nunlinked, 0); + zil_sums_init(&dk->dk_zil_sums); + + dk->dk_kstats = kstat; + kstat_install(kstat); + return (0); } void @@ -155,19 +183,31 @@ dataset_kstats_destroy(dataset_kstats_t *dk) return; dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + kstat_delete(dk->dk_kstats); + dk->dk_kstats = NULL; kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name), KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); kmem_free(dkv, sizeof (empty_dataset_kstats)); - kstat_delete(dk->dk_kstats); - dk->dk_kstats = NULL; - wmsum_fini(&dk->dk_sums.dss_writes); wmsum_fini(&dk->dk_sums.dss_nwritten); wmsum_fini(&dk->dk_sums.dss_reads); wmsum_fini(&dk->dk_sums.dss_nread); wmsum_fini(&dk->dk_sums.dss_nunlinks); wmsum_fini(&dk->dk_sums.dss_nunlinked); + zil_sums_fini(&dk->dk_zil_sums); +} + +void +dataset_kstats_rename(dataset_kstats_t *dk, const char *name) +{ + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + char *ds_name; + + ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name); + ASSERT3S(ds_name, !=, NULL); + (void) strlcpy(ds_name, name, + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); } void diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index fe54da425286..56fe2c4dbe30 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -6,7 +6,7 @@ * You may not use this file except in 
compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -26,6 +26,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/zfs_context.h> @@ -49,13 +50,14 @@ #include <sys/trace_zfs.h> #include <sys/callb.h> #include <sys/abd.h> +#include <sys/brt.h> #include <sys/vdev.h> #include <cityhash.h> #include <sys/spa_impl.h> #include <sys/wmsum.h> #include <sys/vdev_impl.h> -kstat_t *dbuf_ksp; +static kstat_t *dbuf_ksp; typedef struct dbuf_stats { /* @@ -100,6 +102,11 @@ typedef struct dbuf_stats { */ kstat_named_t hash_insert_race; /* + * Number of entries in the hash table dbuf and mutex arrays. + */ + kstat_named_t hash_table_count; + kstat_named_t hash_mutex_count; + /* * Statistics about the size of the metadata dbuf cache. */ kstat_named_t metadata_cache_count; @@ -131,6 +138,8 @@ dbuf_stats_t dbuf_stats = { { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, + { "hash_table_count", KSTAT_DATA_UINT64 }, + { "hash_mutex_count", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, @@ -152,13 +161,13 @@ struct { } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ - wmsum_add(&dbuf_sums.stat, val); + wmsum_add(&dbuf_sums.stat, val) #define DBUF_STAT_DECR(stat, val) \ - DBUF_STAT_INCR(stat, -(val)); + DBUF_STAT_INCR(stat, -(val)) #define DBUF_STAT_BUMP(stat) \ - DBUF_STAT_INCR(stat, 1); + DBUF_STAT_INCR(stat, 1) #define DBUF_STAT_BUMPDOWN(stat) \ - DBUF_STAT_INCR(stat, -1); + DBUF_STAT_INCR(stat, -1) #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ @@ -166,15 +175,8 @@ struct { continue; \ } -static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); -static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); - -extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, - dmu_buf_evict_func_t *evict_func_sync, - dmu_buf_evict_func_t *evict_func_async, - dmu_buf_t **clear_on_evict_dbufp); /* * Global data structures and functions for the dbuf cache. 
@@ -225,12 +227,15 @@ typedef struct dbuf_cache { dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ -unsigned long dbuf_cache_max_bytes = ULONG_MAX; -unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX; +static uint64_t dbuf_cache_max_bytes = UINT64_MAX; +static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX; /* Set the default sizes of the caches to log2 fraction of arc size */ -int dbuf_cache_shift = 5; -int dbuf_metadata_cache_shift = 6; +static uint_t dbuf_cache_shift = 5; +static uint_t dbuf_metadata_cache_shift = 6; + +/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ +static uint_t dbuf_mutex_cache_shift = 0; static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); @@ -277,18 +282,18 @@ static unsigned long dbuf_metadata_cache_target_bytes(void); /* * The percentage above and below the maximum cache size. */ -uint_t dbuf_cache_hiwater_pct = 10; -uint_t dbuf_cache_lowater_pct = 10; +static uint_t dbuf_cache_hiwater_pct = 10; +static uint_t dbuf_cache_lowater_pct = 10; -/* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { + (void) unused, (void) kmflag; dmu_buf_impl_t *db = vdb; - bzero(db, sizeof (dmu_buf_impl_t)); + memset(db, 0, sizeof (dmu_buf_impl_t)); - mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); - rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&db->db_mtx, NULL, MUTEX_NOLOCKDEP, NULL); + rw_init(&db->db_rwlock, NULL, RW_NOLOCKDEP, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -296,10 +301,10 @@ dbuf_cons(void *vdb, void *unused, int kmflag) return (0); } -/* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { + (void) unused; dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); rw_destroy(&db->db_rwlock); @@ -334,7 +339,8 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * -dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) +dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid, + uint64_t *hash_out) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv; @@ -356,6 +362,8 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); + if (hash_out != NULL) + *hash_out = hv; return (NULL); } @@ -390,13 +398,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db) objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; - uint64_t blkid, hv, idx; + uint64_t blkid, idx; dmu_buf_impl_t *dbf; uint32_t i; blkid = db->db_blkid; - hv = dbuf_hash(os, obj, level, blkid); - idx = hv & h->hash_table_mask; + ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash); + idx = db->db_hash & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; @@ -470,12 +478,12 @@ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv, idx; + uint64_t idx; dmu_buf_impl_t *dbf, **dbp; - hv = dbuf_hash(db->db_objset, db->db.db_object, - db->db_level, db->db_blkid); - idx = hv & h->hash_table_mask; + ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level, + db->db_blkid), ==, db->db_hash); + idx = db->db_hash & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: @@ -560,6 +568,21 @@ dbuf_evict_user(dmu_buf_impl_t *db) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif + 
if (db->db_caching_status != DB_NO_CACHE) { + /* + * This is a cached dbuf, so the size of the user data is + * included in its cached amount. We adjust it here because the + * user data has already been detached from the dbuf, and the + * sync functions are not supposed to touch it (the dbuf might + * not exist anymore by the time the sync functions run. + */ + uint64_t size = dbu->dbu_size; + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, size, dbu); + if (db->db_caching_status == DB_DBUF_CACHE) + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size); + } + /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for @@ -607,58 +630,58 @@ dbuf_is_metadata(dmu_buf_impl_t *db) boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db) { - vdev_t *vd = NULL; - zfs_cache_type_t cache = db->db_objset->os_secondary_cache; - blkptr_t *bp = db->db_blkptr; - - if (bp != NULL && !BP_IS_HOLE(bp)) { + if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL || + (db->db_objset->os_secondary_cache == + ZFS_CACHE_METADATA && dbuf_is_metadata(db))) { + if (l2arc_exclude_special == 0) + return (B_TRUE); + + blkptr_t *bp = db->db_blkptr; + if (bp == NULL || BP_IS_HOLE(bp)) + return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; + vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; - if (cache == ZFS_CACHE_ALL || - (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) { - if (vd == NULL) - return (B_TRUE); + if (vd == NULL) + return (B_TRUE); - if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && - vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || - l2arc_exclude_special == 0) - return (B_TRUE); - } + if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) + return (B_TRUE); } - return (B_FALSE); } static inline boolean_t dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) { - vdev_t *vd = NULL; - zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache; - - if (bp != NULL && !BP_IS_HOLE(bp)) { + if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || + (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA && + (level > 0 || + DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) { + if (l2arc_exclude_special == 0) + return (B_TRUE); + + if (bp == NULL || BP_IS_HOLE(bp)) + return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; + vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; - if (cache == ZFS_CACHE_ALL || ((level > 0 || - DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) && - cache == ZFS_CACHE_METADATA)) { - if (vd == NULL) - return (B_TRUE); + if (vd == NULL) + return (B_TRUE); - if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && - vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || - l2arc_exclude_special == 0) - return (B_TRUE); - } + if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) + return (B_TRUE); } - return (B_FALSE); } @@ -745,7 +768,7 @@ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); - multilist_sublist_t *mls = multilist_sublist_lock( + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); @@ -761,12 +784,15 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); 
multilist_sublist_unlock(mls); + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); + (void) zfs_refcount_remove_many( + &dbuf_caches[DB_DBUF_CACHE].size, size, db); (void) zfs_refcount_remove_many( - &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + &dbuf_caches[DB_DBUF_CACHE].size, usize, db->db_user); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); - DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size + usize); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); @@ -783,10 +809,10 @@ dbuf_evict_one(void) * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ -/* ARGSUSED */ -static void +static __attribute__((noreturn)) void dbuf_evict_thread(void *unused) { + (void) unused; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); @@ -822,7 +848,7 @@ dbuf_evict_thread(void *unused) /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the - * dbuf cache using the callers context. + * dbuf cache using the caller's context. */ static void dbuf_evict_notify(uint64_t size) @@ -843,6 +869,7 @@ static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; + dbuf_hash_table_t *h = &dbuf_hash_table; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); @@ -872,6 +899,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = wmsum_value(&dbuf_sums.hash_insert_race); + ds->hash_table_count.value.ui64 = h->hash_table_mask + 1; + ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1; ds->metadata_cache_count.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_count); ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( @@ -884,9 +913,8 @@ dbuf_kstat_update(kstat_t *ksp, int rw) void dbuf_init(void) { - uint64_t hsize = 1ULL << 16; + uint64_t hmsize, hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; - int i; /* * The hash table is big enough to fill one eighth of physical memory @@ -897,30 +925,43 @@ dbuf_init(void) while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; -retry: - h->hash_table_mask = hsize - 1; -#if defined(_KERNEL) + h->hash_table = NULL; + while (h->hash_table == NULL) { + h->hash_table_mask = hsize - 1; + + h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); + if (h->hash_table == NULL) + hsize >>= 1; + + ASSERT3U(hsize, >=, 1ULL << 10); + } + /* - * Large allocations which do not require contiguous pages - * should be using vmem_alloc() in the linux kernel + * The hash table buckets are protected by an array of mutexes where + * each mutex is reponsible for protecting 128 buckets. A minimum + * array size of 8192 is targeted to avoid contention. 
*/ - h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); -#else - h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); -#endif - if (h->hash_table == NULL) { - /* XXX - we should really return an error instead of assert */ - ASSERT(hsize > (1ULL << 10)); - hsize >>= 1; - goto retry; + if (dbuf_mutex_cache_shift == 0) + hmsize = MAX(hsize >> 7, 1ULL << 13); + else + hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24); + + h->hash_mutexes = NULL; + while (h->hash_mutexes == NULL) { + h->hash_mutex_mask = hmsize - 1; + + h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t), + KM_SLEEP); + if (h->hash_mutexes == NULL) + hmsize >>= 1; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + for (int i = 0; i < hmsize; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL); dbuf_stats_init(h); @@ -946,7 +987,7 @@ retry: wmsum_init(&dbuf_sums.cache_count, 0); wmsum_init(&dbuf_sums.cache_total_evicts, 0); - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } @@ -962,7 +1003,7 @@ retry: KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = @@ -982,21 +1023,16 @@ void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; - int i; dbuf_stats_destroy(); - for (i = 0; i < DBUF_MUTEXES; i++) + for (int i = 0; i < (h->hash_mutex_mask + 1); i++) mutex_destroy(&h->hash_mutexes[i]); -#if defined(_KERNEL) - /* - * Large allocations which do not require contiguous pages - * should be using vmem_free() in the linux kernel - */ + vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -#else - kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); -#endif + vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) * + sizeof (kmutex_t)); + kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); @@ -1023,7 +1059,7 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.cache_count); wmsum_fini(&dbuf_sums.cache_total_evicts); - for (i = 0; i < DN_MAX_LEVELS; i++) { + for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } @@ -1137,7 +1173,7 @@ dbuf_verify(dmu_buf_impl_t *db) if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && - db->db_state != DB_FILL && !dn->dn_free_txg) { + db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that @@ -1183,7 +1219,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); - ASSERT0(bp->blk_phys_birth); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); } } } @@ -1240,7 +1276,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); - bcopy(db->db.db_data, abuf->b_data, blksz); + memcpy(abuf->b_data, db->db.db_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); @@ -1302,7 
+1338,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) * used when modifying or reading db_blkptr. */ db_lock_type_t -dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) +dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag) { enum db_lock_type ret = DLT_NONE; if (db->db_parent != NULL) { @@ -1327,7 +1363,7 @@ dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) * panic if we didn't pass the lock type in. */ void -dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) +dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag) { if (type == DLT_PARENT) rw_exit(&db->db_parent->db_rwlock); @@ -1339,6 +1375,7 @@ static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { + (void) zb, (void) bp; dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); @@ -1360,7 +1397,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); arc_release(buf, db); - bzero(buf->b_data, db->db.db_size); + memset(buf->b_data, 0, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); @@ -1383,13 +1420,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, * a decrypted block. Otherwise success. */ static int -dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) { - int bonuslen, max_bonuslen, err; - - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err) - return (err); + int bonuslen, max_bonuslen; bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); @@ -1399,16 +1432,16 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) - bzero(db->db.db_data, max_bonuslen); + memset(db->db.db_data, 0, max_bonuslen); if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "bonus buffer filled"); return (0); } static void -dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) +dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) { blkptr_t *bps = db->db.db_data; uint32_t indbs = 1ULL << dn->dn_indblkshift; @@ -1417,12 +1450,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) for (int i = 0; i < n_bps; i++) { blkptr_t *bp = &bps[i]; - ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs); - BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? - dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); - BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); - BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); - BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs); + BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ? + dn->dn_datablksz : BP_GET_LSIZE(dbbp)); + BP_SET_TYPE(bp, BP_GET_TYPE(dbbp)); + BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1); + BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0); } } @@ -1432,30 +1465,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) * was taken, ENOENT if no action was taken. 
*/ static int -dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); - int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); + int is_hole = bp == NULL || BP_IS_HOLE(bp); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ - if (!is_hole && db->db_level == 0) { - is_hole = dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr); - } + if (!is_hole && db->db_level == 0) + is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp); if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { - dbuf_handle_indirect_hole(db, dn); + if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && + BP_GET_LOGICAL_BIRTH(bp) != 0) { + dbuf_handle_indirect_hole(db, dn, bp); } db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); @@ -1477,32 +1507,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int -dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { - int err = 0; objset_t *os = db->db_objset; - arc_buf_t *dnode_abuf; - dnode_t *dn; + dmu_buf_impl_t *dndb; + arc_buf_t *dnbuf; zbookmark_phys_t zb; + int err; - ASSERT(MUTEX_HELD(&db->db_mtx)); + if ((flags & DB_RF_NO_DECRYPT) != 0 || + !os->os_encrypted || os->os_raw_receive || + (dndb = dn->dn_dbuf) == NULL) + return (0); - if (!os->os_encrypted || os->os_raw_receive || - (flags & DB_RF_NO_DECRYPT) != 0) + dnbuf = dndb->db_buf; + if (!arc_is_encrypted(dnbuf)) return (0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; + mutex_enter(&dndb->db_mtx); - if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { - DB_DNODE_EXIT(db); - return (0); - } + /* + * Since dnode buffer is modified by sync process, there can be only + * one copy of it. It means we can not modify (decrypt) it while it + * is being written. I don't see how this may happen now, since + * encrypted dnode writes by receive should be completed before any + * plain-text reads due to txg wait, but better be safe than sorry. + */ + while (1) { + if (!arc_is_encrypted(dnbuf)) { + mutex_exit(&dndb->db_mtx); + return (0); + } + dbuf_dirty_record_t *dr = dndb->db_data_pending; + if (dr == NULL || dr->dt.dl.dr_data != dnbuf) + break; + cv_wait(&dndb->db_changed, &dndb->db_mtx); + }; SET_BOOKMARK(&zb, dmu_objset_id(os), - DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); - err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); + DMU_META_DNODE_OBJECT, 0, dndb->db_blkid); + err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not @@ -1515,7 +1559,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; - DB_DNODE_EXIT(db); + mutex_exit(&dndb->db_mtx); return (err); } @@ -1525,39 +1569,63 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) * returning. 
*/ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, - db_lock_type_t dblt, void *tag) +dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, + db_lock_type_t dblt, const void *tag) { - dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; + blkptr_t bp, *bpp = NULL; - err = zio_flags = 0; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_buf == NULL); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { - err = dbuf_read_bonus(db, dn, flags); + err = dbuf_read_bonus(db, dn); goto early_unlock; } - err = dbuf_read_hole(db, dn, flags); + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; + bpp = &bp; + } + } + + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; + bpp = &bp; + } + + err = dbuf_read_hole(db, dn, bpp); if (err == 0) goto early_unlock; + ASSERT(bpp != NULL); + /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ - if (BP_IS_REDACTED(db->db_blkptr)) { + if (BP_IS_REDACTED(bpp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); @@ -1572,25 +1640,20 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ - if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { - spa_log_error(db->db_objset->os_spa, &zb); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(db->db_objset)); + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { + spa_log_error(db->db_objset->os_spa, &zb, + BP_GET_LOGICAL_BIRTH(bpp)); err = SET_ERROR(EIO); goto early_unlock; } - err = dbuf_read_verify_dnode_crypt(db, flags); - if (err != 0) - goto early_unlock; - - DB_DNODE_EXIT(db); - db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); - if (dbuf_is_l2cacheable(db)) + if (!DBUF_IS_CACHEABLE(db)) + aflags |= ARC_FLAG_UNCACHED; + else if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -1601,20 +1664,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; /* - * The zio layer will copy the provided blkptr later, but we need to - * do this now so that we can release the parent's rwlock. 
We have to - * do that now so that if dbuf_read_done is called synchronously (on + * The zio layer will copy the provided blkptr later, but we have our + * own copy so that we can release the parent's rwlock. We have to + * do that so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ - blkptr_t bp = *db->db_blkptr; dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, &bp, + return (arc_read(zio, db->db_objset->os_spa, bpp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, - &aflags, &zb); - return (err); + &aflags, &zb)); + early_unlock: - DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); @@ -1661,7 +1722,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); - bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); + memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { dnode_t *dn = DB_DNODE(db); int size = arc_buf_size(db->db_buf); @@ -1691,7 +1752,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } else { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); @@ -1699,38 +1760,65 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } int -dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) { - int err = 0; - boolean_t prefetch; dnode_t *dn; + boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch; + int err; - /* - * We don't have to hold the mutex to check db_state because it - * can't be freed while we have a hold on the buffer. - */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - if (db->db_state == DB_NOFILL) - return (SET_ERROR(EIO)); - DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* + * Ensure that this block's dnode has been decrypted if the caller + * has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, dn, flags); + if (err != 0) + goto done; + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && - DBUF_IS_CACHEABLE(db); + (flags & DB_RF_NOPREFETCH) == 0; mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { - spa_t *spa = dn->dn_objset->os_spa; + if (flags & DB_RF_PARTIAL_FIRST) + db->db_partial_read = B_TRUE; + else if (!(flags & DB_RF_PARTIAL_MORE)) + db->db_partial_read = B_FALSE; + miss = (db->db_state != DB_CACHED); + if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* - * Ensure that this block's dnode has been decrypted if - * the caller has requested decrypted data. + * Another reader came in while the dbuf was in flight between + * UNCACHED and CACHED. Either a writer will finish filling + * the buffer, sending the dbuf to CACHED, or the first reader's + * request will reach the read_done callback and send the dbuf + * to CACHED. Otherwise, a failure occurred and the dbuf will + * be sent to UNCACHED. 
*/ - err = dbuf_read_verify_dnode_crypt(db, flags); + if (flags & DB_RF_NEVERWAIT) { + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + do { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, + zio_t *, pio); + cv_wait(&db->db_changed, &db->db_mtx); + } while (db->db_state == DB_READ || db->db_state == DB_FILL); + if (db->db_state == DB_UNCACHED) { + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + DB_DNODE_EXIT(db); + goto done; + } + } + if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it @@ -1738,11 +1826,11 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) * unauthenticated blocks, which will verify their MAC if * the key is now available. */ - if (err == 0 && db->db_buf != NULL && - (flags & DB_RF_NO_DECRYPT) == 0 && + if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + spa_t *spa = dn->dn_objset->os_spa; zbookmark_phys_t zb; SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), @@ -1752,80 +1840,49 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_FALSE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_hits); - } else if (db->db_state == DB_UNCACHED) { - spa_t *spa = dn->dn_objset->os_spa; - boolean_t need_wait = B_FALSE; - + } else { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); - - if (zio == NULL && - db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { - zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + if (pio == NULL && (db->db_state == DB_NOFILL || + (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { + spa_t *spa = dn->dn_objset->os_spa; + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags, dblt, FTAG); - /* - * dbuf_read_impl has dropped db_mtx and our parent's rwlock - * for us - */ - if (!err && prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - db->db_state != DB_CACHED, - flags & DB_RF_HAVESTRUCT); - } + err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); + /* dbuf_read_impl drops db_mtx and parent's rwlock. */ + miss = (db->db_state != DB_CACHED); + } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); - /* - * If we created a zio_root we must execute it to avoid - * leaking it, even if it isn't attached to any work due - * to an error in dbuf_read_impl(). - */ - if (need_wait) { - if (err == 0) - err = zio_wait(zio); - else - VERIFY0(zio_wait(zio)); - } - } else { - /* - * Another reader came in while the dbuf was in flight - * between UNCACHED and CACHED. Either a writer will finish - * writing the buffer (sending the dbuf to CACHED) or the - * first reader's request will reach the read_done callback - * and send the dbuf to CACHED. Otherwise, a failure - * occurred and the dbuf went to UNCACHED. 
- */ - mutex_exit(&db->db_mtx); - if (prefetch) { - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - B_TRUE, flags & DB_RF_HAVESTRUCT); - } - DB_DNODE_EXIT(db); - DBUF_STAT_BUMP(hash_misses); + /* + * If we created a zio we must execute it to avoid leaking it, even if + * it isn't attached to any work due to an error in dbuf_read_impl(). + */ + if (need_wait) { + if (err == 0) + err = zio_wait(pio); + else + (void) zio_wait(pio); + pio = NULL; + } - /* Skip the wait per the caller's request. */ - if ((flags & DB_RF_NEVERWAIT) == 0) { - mutex_enter(&db->db_mtx); - while (db->db_state == DB_READ || - db->db_state == DB_FILL) { - ASSERT(db->db_state == DB_READ || - (flags & DB_RF_HAVESTRUCT) == 0); - DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, - db, zio_t *, zio); - cv_wait(&db->db_changed, &db->db_mtx); - } - if (db->db_state == DB_UNCACHED) - err = SET_ERROR(EIO); - mutex_exit(&db->db_mtx); - } +done: + if (miss) + DBUF_STAT_BUMP(hash_misses); + else + DBUF_STAT_BUMP(hash_hits); + if (pio && err != 0) { + zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + zio->io_error = err; + zio_nowait(zio); } return (err); @@ -1879,8 +1936,13 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); + if (dr->dt.dl.dr_brtwrite) { + ASSERT0P(dr->dt.dl.dr_data); + dr->dt.dl.dr_data = db->db_buf; + } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* @@ -1891,7 +1953,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ - arc_release(dr->dt.dl.dr_data, db); + if (dr->dt.dl.dr_data) + arc_release(dr->dt.dl.dr_data, db); } /* @@ -1989,7 +2052,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); rw_enter(&db->db_rwlock, RW_WRITER); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } @@ -1997,8 +2060,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&db->db_mtx); } - kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); + kmem_free(db_search, sizeof (dmu_buf_impl_t)); } void @@ -2026,10 +2089,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) /* copy old block data to the new block */ old_buf = db->db_buf; - bcopy(old_buf->b_data, buf->b_data, MIN(osize, size)); + memcpy(buf->b_data, old_buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); + memset((uint8_t *)buf->b_data + osize, 0, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); @@ -2110,7 +2173,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) * Otherwise the buffer contents could be inconsistent between the * dbuf and the lightweight dirty record. 
*/ - ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)); + ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid, + NULL)); mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; @@ -2262,7 +2326,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - if (db->db_blkid != DMU_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dmu_objset_willuse_space(os, db->db.db_size, tx); } @@ -2305,8 +2369,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } - if (db->db_blkid != DMU_BONUS_BLKID) + if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) { dr->dr_accounted = db->db.db_size; + } dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; list_insert_before(&db->db_dirty_records, dr_next, dr); @@ -2462,10 +2527,11 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ -static boolean_t +boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; + boolean_t brtwrite; ASSERT(txg != 0); @@ -2490,6 +2556,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (B_FALSE); ASSERT(dr->dr_dbuf == db); + brtwrite = dr->dt.dl.dr_brtwrite; + if (brtwrite) { + /* + * We are freeing a block that we cloned in the same + * transaction group. + */ + brt_pending_remove(dmu_objset_spa(db->db_objset), + &dr->dt.dl.dr_overridden_by, tx); + } + dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -2519,7 +2595,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); } - if (db->db_state != DB_NOFILL) { + if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); @@ -2534,7 +2610,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + ASSERT(db->db_state == DB_NOFILL || brtwrite || + arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } @@ -2546,30 +2623,40 @@ static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + boolean_t undirty = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* - * Quick check for dirtiness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). + * Quick check for dirtiness to improve performance for some workloads + * (e.g. file deletion with indirect blocks cached). */ mutex_enter(&db->db_mtx); - - if (db->db_state == DB_CACHED) { - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { /* - * It's possible that it is already dirty but not cached, + * It's possible that the dbuf is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr != NULL) { - /* This dbuf is already dirty and cached. 
*/ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; + if (db->db_level == 0 && + dr->dt.dl.dr_brtwrite) { + /* + * Block cloning: If we are dirtying a cloned + * level 0 block, we cannot simply redirty it, + * because this dr has no associated data. + * We will go through a full undirtying below, + * before dirtying it again. + */ + undirty = B_TRUE; + } else { + /* This dbuf is already dirty and cached. */ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } } } mutex_exit(&db->db_mtx); @@ -2578,7 +2665,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); + + /* + * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we + * want to make sure dbuf_read() will read the pending cloned block and + * not the uderlying block that is being replaced. dbuf_undirty() will + * do dbuf_unoverride(), so we will end up with cloned block content, + * without overridden BP. + */ (void) dbuf_read(db, NULL, flags); + if (undirty) { + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + } (void) dbuf_dirty(db, tx); } @@ -2602,17 +2702,51 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) } void +dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + /* + * Block cloning: We are going to clone into this block, so undirty + * modifications done to this block so far in this txg. This includes + * writes and clones into this block. + */ + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + VERIFY(!dbuf_undirty(db, tx)); + ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + dbuf_clear_data(db); + } + + db->db_state = DB_NOFILL; + DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); +} + +void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + mutex_enter(&db->db_mtx); db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); - dmu_buf_will_fill(db_fake, tx); + mutex_exit(&db->db_mtx); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -2624,6 +2758,25 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + mutex_enter(&db->db_mtx); + if (db->db_state == DB_NOFILL) { + /* + * Block cloning: We will be completely overwriting a block + * cloned in this transaction group, so let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. But if the fill can fail + * we should have a way to return back to the cloned data. 
+ */ + if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + dmu_buf_will_dirty(db_fake, tx); + return; + } + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; + } + mutex_exit(&db->db_mtx); + dbuf_noread(db); (void) dbuf_dirty(db, tx); } @@ -2659,9 +2812,9 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; - bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN); - bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN); - bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN); + memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN); + memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN); + memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN); } static void @@ -2671,39 +2824,50 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); + ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } -/* ARGSUSED */ -void -dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) +boolean_t +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) { + (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - old_state = db->db_state; - db->db_state = DB_CACHED; - if (old_state == DB_FILL) { + if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done handling freed in flight"); + failed = B_FALSE; + } else if (failed) { + VERIFY(!dbuf_undirty(db, tx)); + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + dbuf_clear_data(db); + DTRACE_SET_STATE(db, "fill failed"); } else { + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); + } else { + db->db_state = DB_CACHED; + failed = B_FALSE; } mutex_exit(&db->db_mtx); + return (failed); } void @@ -2732,6 +2896,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, dmu_buf_will_not_fill(dbuf, tx); dr = list_head(&db->db_dirty_records); + ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, @@ -2742,7 +2907,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } void @@ -2793,7 +2958,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { @@ -2806,7 +2972,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ASSERT(!arc_is_encrypted(buf)); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, 
db->db.db_data, db->db.db_size); + memcpy(db->db.db_data, buf->b_data, db->db.db_size); arc_buf_destroy(buf, db); return; } @@ -2830,6 +2996,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; + } else if (db->db_state == DB_NOFILL) { + /* + * We will be completely replacing the cloned block. In case + * it was cloned in this transaction group, let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); @@ -2837,7 +3012,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); + dmu_buf_fill_done(&db->db, tx, B_FALSE); } void @@ -2873,6 +3048,8 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + ASSERT0(dmu_buf_user_size(&db->db)); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); @@ -2945,9 +3122,6 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); - kmem_cache_free(dbuf_kmem_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); - /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. @@ -2956,6 +3130,9 @@ dbuf_destroy(dmu_buf_impl_t *db) mutex_enter(&parent->db_mtx); dbuf_rele_and_unlock(parent, db, B_TRUE); } + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); } /* @@ -3057,7 +3234,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, - dmu_buf_impl_t *parent, blkptr_t *blkptr) + dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; @@ -3078,6 +3255,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; + db->db_hash = hash; db->db_user = NULL; db->db_user_immediate_evict = FALSE; @@ -3161,6 +3339,7 @@ dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); if (err == 0) { + ASSERT3P(bp2, !=, NULL); *bp = *bp2; if (dbp != NULL) dbuf_rele(dbp, NULL); @@ -3189,8 +3368,10 @@ typedef struct dbuf_prefetch_arg { static void dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) { - if (dpa->dpa_cb != NULL) - dpa->dpa_cb(dpa->dpa_arg, io_done); + if (dpa->dpa_cb != NULL) { + dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level, + dpa->dpa_zb.zb_blkid, io_done); + } kmem_free(dpa, sizeof (*dpa)); } @@ -3198,11 +3379,13 @@ static void dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { + (void) zio, (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; - dbuf_prefetch_fini(dpa, B_TRUE); if (abuf != NULL) arc_buf_destroy(abuf, private); + + dbuf_prefetch_fini(dpa, B_TRUE); } /* @@ -3246,6 +3429,7 @@ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { + (void) zb, (void) 
iobp; dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); @@ -3253,7 +3437,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); - return (dbuf_prefetch_fini(dpa, B_TRUE)); + dbuf_prefetch_fini(dpa, B_TRUE); + return; } ASSERT(zio == NULL || zio->io_error == 0); @@ -3286,7 +3471,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { arc_buf_destroy(abuf, private); - return (dbuf_prefetch_fini(dpa, B_TRUE)); + dbuf_prefetch_fini(dpa, B_TRUE); + return; } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); @@ -3299,12 +3485,14 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - ASSERT(!BP_IS_REDACTED(bp) || + ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode && dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, - SPA_FEATURE_REDACTED_DATASETS)); + SPA_FEATURE_REDACTED_DATASETS))); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { + arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); + return; } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); @@ -3322,7 +3510,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + bp, dbuf_prefetch_indirect_done, dpa, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } @@ -3368,7 +3557,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, - level, blkid); + level, blkid, NULL); if (db != NULL) { mutex_exit(&db->db_mtx); /* @@ -3432,8 +3621,9 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, dpa->dpa_cb = cb; dpa->dpa_arg = arg; - /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (dnode_level_is_l2cacheable(&bp, dn, level)) + if (!DNODE_LEVEL_IS_CACHEABLE(dn, level)) + dpa->dpa_aflags |= ARC_FLAG_UNCACHED; + else if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* @@ -3457,7 +3647,8 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, SET_BOOKMARK(&zb, ds != NULL ? 
ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, - &bp, dbuf_prefetch_indirect_done, dpa, prio, + &bp, dbuf_prefetch_indirect_done, dpa, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } @@ -3469,7 +3660,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, return (1); no_issue: if (cb != NULL) - cb(arg, B_FALSE); + cb(arg, level, blkid, B_FALSE); return (0); } @@ -3518,7 +3709,7 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) } rw_enter(&db->db_rwlock, RW_WRITER); - bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); + memcpy(db->db.db_data, data->b_data, arc_buf_size(data)); rw_exit(&db->db_rwlock); } @@ -3529,9 +3720,10 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) + const void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; + uint64_t hv; /* If the pool has been created, verify the tx_sync_lock is not held */ spa_t *spa = dn->dn_objset->os_spa; @@ -3547,7 +3739,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, *dbp = NULL; /* dbuf_find() returns with db_mtx held */ - db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); + db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv); if (db == NULL) { blkptr_t *bp = NULL; @@ -3569,7 +3761,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, } if (err && err != ENOENT) return (err); - db = dbuf_create(dn, level, blkid, parent, bp); + db = dbuf_create(dn, level, blkid, parent, bp, hv); } if (fail_uncached && db->db_state != DB_CACHED) { @@ -3593,8 +3785,10 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; - if (dr->dt.dl.dr_data == db->db_buf) + if (dr->dt.dl.dr_data == db->db_buf) { + ASSERT3P(db->db_buf, !=, NULL); dbuf_hold_copy(dn, db); + } } if (multilist_link_active(&db->db_cache_link)) { @@ -3603,9 +3797,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); + + uint64_t size = db->db.db_size; + uint64_t usize = dmu_buf_user_size(&db->db); (void) zfs_refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); + &dbuf_caches[db->db_caching_status].size, size, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, usize, + db->db_user); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); @@ -3613,7 +3812,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], - db->db.db_size); + size + usize); } db->db_caching_status = DB_NO_CACHE; } @@ -3634,13 +3833,13 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, } dmu_buf_impl_t * -dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * -dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, 
blkid, FALSE, FALSE, tag, &db); @@ -3653,7 +3852,8 @@ dbuf_create_bonus(dnode_t *dn) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL, + dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID)); } int @@ -3681,7 +3881,7 @@ dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) #pragma weak dmu_buf_add_ref = dbuf_add_ref void -dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +dbuf_add_ref(dmu_buf_impl_t *db, const void *tag) { int64_t holds = zfs_refcount_add(&db->db_holds, tag); VERIFY3S(holds, >, 1); @@ -3690,7 +3890,7 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, - void *tag) + const void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; @@ -3699,7 +3899,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else - found_db = dbuf_find(os, obj, 0, blkid); + found_db = dbuf_find(os, obj, 0, blkid, NULL); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { @@ -3719,14 +3919,14 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, * dnode's parent dbuf evicting its dnode handles. */ void -dbuf_rele(dmu_buf_impl_t *db, void *tag) +dbuf_rele(dmu_buf_impl_t *db, const void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag, B_FALSE); } void -dmu_buf_rele(dmu_buf_t *db, void *tag) +dmu_buf_rele(dmu_buf_t *db, const void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } @@ -3745,7 +3945,7 @@ dmu_buf_rele(dmu_buf_t *db, void *tag) * */ void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) +dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) { int64_t holds; uint64_t size; @@ -3819,59 +4019,41 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); - } else { - boolean_t do_arc_evict = B_FALSE; - blkptr_t bp; - spa_t *spa = dmu_objset_spa(db->db_objset); - - if (!DBUF_IS_CACHEABLE(db) && - db->db_blkptr != NULL && - !BP_IS_HOLE(db->db_blkptr) && - !BP_IS_EMBEDDED(db->db_blkptr)) { - do_arc_evict = B_TRUE; - bp = *db->db_blkptr; - } - - if (!DBUF_IS_CACHEABLE(db) || - db->db_pending_evict) { - dbuf_destroy(db); - } else if (!multilist_link_active(&db->db_cache_link)) { - ASSERT3U(db->db_caching_status, ==, - DB_NO_CACHE); - - dbuf_cached_state_t dcs = - dbuf_include_in_metadata_cache(db) ? 
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; - db->db_caching_status = dcs; - - multilist_insert(&dbuf_caches[dcs].cache, db); - uint64_t db_size = db->db.db_size; - size = zfs_refcount_add_many( - &dbuf_caches[dcs].size, db_size, db); - uint8_t db_level = db->db_level; - mutex_exit(&db->db_mtx); - - if (dcs == DB_DBUF_METADATA_CACHE) { - DBUF_STAT_BUMP(metadata_cache_count); - DBUF_STAT_MAX( - metadata_cache_size_bytes_max, - size); - } else { - DBUF_STAT_BUMP(cache_count); - DBUF_STAT_MAX(cache_size_bytes_max, - size); - DBUF_STAT_BUMP(cache_levels[db_level]); - DBUF_STAT_INCR( - cache_levels_bytes[db_level], - db_size); - } + } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? + DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(&dbuf_caches[dcs].cache, db); + uint64_t db_size = db->db.db_size; + uint64_t dbu_size = dmu_buf_user_size(&db->db); + (void) zfs_refcount_add_many( + &dbuf_caches[dcs].size, db_size, db); + size = zfs_refcount_add_many( + &dbuf_caches[dcs].size, dbu_size, db->db_user); + uint8_t db_level = db->db_level; + mutex_exit(&db->db_mtx); - if (dcs == DB_DBUF_CACHE && !evicting) - dbuf_evict_notify(size); + if (dcs == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMP(metadata_cache_count); + DBUF_STAT_MAX(metadata_cache_size_bytes_max, + size); + } else { + DBUF_STAT_BUMP(cache_count); + DBUF_STAT_MAX(cache_size_bytes_max, size); + DBUF_STAT_BUMP(cache_levels[db_level]); + DBUF_STAT_INCR(cache_levels_bytes[db_level], + db_size + dbu_size); } - if (do_arc_evict) - arc_freed(spa, &bp); + if (dcs == DB_DBUF_CACHE && !evicting) + dbuf_evict_notify(size); } } else { mutex_exit(&db->db_mtx); @@ -3948,8 +4130,37 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user); } +uint64_t +dmu_buf_user_size(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + if (db->db_user == NULL) + return (0); + return (atomic_load_64(&db->db_user->dbu_size)); +} + +void +dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd); + atomic_add_64(&db->db_user->dbu_size, nadd); +} + void -dmu_buf_user_evict_wait() +dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT3P(db->db_user, !=, NULL); + ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub); + atomic_sub_64(&db->db_user->dbu_size, nsub); +} + +void +dmu_buf_user_evict_wait(void) { taskq_wait(dbu_evict_taskq); } @@ -3968,21 +4179,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -4042,7 +4238,7 @@ dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = dr->dr_dnode; ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); - 
bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); + memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys)); dbuf_sync_leaf_verify_bonus_dnode(dr); @@ -4244,22 +4440,6 @@ dbuf_lightweight_ready(zio_t *zio) } static void -dbuf_lightweight_physdone(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dsl_pool_t *dp = spa_get_dsl(zio->io_spa); - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_lightweight_done(). - */ - int delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - -static void dbuf_lightweight_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; @@ -4277,16 +4457,8 @@ dbuf_lightweight_done(zio_t *zio) dsl_dataset_block_born(ds, zio->io_bp, tx); } - /* - * See comment in dbuf_write_done(). - */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); @@ -4320,8 +4492,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, - dbuf_lightweight_physdone, dbuf_lightweight_done, dr, - ZIO_PRIORITY_ASYNC_WRITE, + dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); zio_nowait(dr->dr_zio); @@ -4356,6 +4527,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); + } else if (db->db_state == DB_READ) { + /* + * This buffer has a clone we need to write, and an in-flight + * read on the BP we're about to clone. Its safe to issue the + * write here because the read has already been issued and the + * contents won't change. 
+ */ + ASSERT(dr->dt.dl.dr_brtwrite && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -4412,7 +4592,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } /* @@ -4422,11 +4601,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); - if (db->db_state != DB_NOFILL && + if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { + dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), @@ -4462,7 +4640,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } else { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } - bcopy(db->db.db_data, (*datap)->b_data, psize); + memcpy((*datap)->b_data, db->db.db_data, psize); } db->db_data_pending = dr; @@ -4478,6 +4656,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } } +/* + * Syncs out a range of dirty records for indirect or leaf dbufs. May be + * called recursively from dbuf_sync_indirect(). + */ void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { @@ -4512,10 +4694,10 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) } } -/* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { + (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; @@ -4534,7 +4716,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (bp->blk_birth != 0) { + if (BP_GET_LOGICAL_BIRTH(bp) != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && @@ -4571,6 +4753,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; + for (int j = 0; j < dnp->dn_nblkptr; + j++) { + (void) zfs_blkptr_verify(spa, + &dnp->dn_blkptr[j], + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } + if (dnp->dn_flags & + DNODE_FLAG_SPILL_BLKPTR) { + (void) zfs_blkptr_verify(spa, + DN_SPILL_BLKPTR(dnp), + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } @@ -4588,6 +4784,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; + (void) zfs_blkptr_verify(spa, ibp, + BLK_CONFIG_SKIP, BLK_VERIFY_HALT); fill += BP_GET_FILL(ibp); } } @@ -4603,7 +4801,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_unlock_parent(db, dblt, FTAG); } -/* ARGSUSED */ /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only @@ -4614,6 +4811,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { + (void) zio, (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; @@ -4642,47 +4840,16 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) * zero out. 
*/ rw_enter(&db->db_rwlock, RW_WRITER); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } -/* - * The SPA will call this callback several times for each zio - once - * for every physical child i/o (zio->io_phys_children times). This - * allows the DMU to monitor the progress of each logical i/o. For example, - * there may be 2 copies of an indirect block, or many fragments of a RAID-Z - * block. There may be a long delay before all copies/fragments are completed, - * so this callback allows us to retire dirty space gradually, as the physical - * i/os complete. - */ -/* ARGSUSED */ -static void -dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -{ - dmu_buf_impl_t *db = arg; - objset_t *os = db->db_objset; - dsl_pool_t *dp = dmu_objset_pool(os); - dbuf_dirty_record_t *dr; - int delta = 0; - - dr = db->db_data_pending; - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_write_done(). - */ - delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - -/* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { + (void) buf; dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; @@ -4726,9 +4893,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); + if (dr->dt.dl.dr_data != NULL && + dr->dt.dl.dr_data != db->db_buf) { + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -4751,27 +4918,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); - /* - * If we didn't do a physical write in this ZIO and we - * still ended up here, it means that the space of the - * dbuf that we just released (and undirtied) above hasn't - * been marked as undirtied in the pool's accounting. - * - * Thus, we undirty that space in the pool's view of the - * world here. For physical writes this type of update - * happens in dbuf_write_physdone(). - * - * If we did a physical write, cleanup any rounding errors - * that came up due to writing multiple copies of a block - * on disk [see dbuf_write_physdone()]. 
- */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); kmem_free(dr, sizeof (dbuf_dirty_record_t)); } @@ -4853,7 +5001,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = bp->blk_birth; + drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp); drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { @@ -4868,7 +5016,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg) { + BP_GET_LOGICAL_BIRTH(bp) > + ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -4928,7 +5077,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) } -/* Issue I/O to commit a dirty buffer to disk. */ +/* + * Populate dr->dr_zio with a zio to commit a dirty buffer to disk. + * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio). + */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { @@ -4946,21 +5098,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) os = dn->dn_objset; - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather than in + * dbuf_dirty() since they are only modified in the syncing + * context and we don't want the overhead of making multiple + * copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) + arc_buf_thaw(data); + else + dbuf_release_bp(db); + dbuf_remap(dn, db, tx); } if (parent != dn->dn_dbuf) { @@ -4987,7 +5136,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? @@ -4996,7 +5145,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + wp_flag |= (data == NULL) ? 
WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); @@ -5019,20 +5168,21 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, + dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, + dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { + } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, NULL, + dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); @@ -5049,11 +5199,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, - &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db), - &zp, dbuf_write_ready, - children_ready_cb, dbuf_write_physdone, - dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); + &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), + dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, + children_ready_cb, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } @@ -5071,6 +5220,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); +EXPORT_SYMBOL(dmu_buf_will_clone); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); @@ -5093,25 +5243,23 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, - "Percentage over dbuf_cache_max_bytes when dbufs must be evicted " - "directly."); + "Percentage over dbuf_cache_max_bytes for direct dbuf eviction."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, - "Percentage below dbuf_cache_max_bytes when the evict thread stops " - "evicting dbufs."); + "Percentage below dbuf_cache_max_bytes when dbuf eviction stops."); + +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW, + "Maximum size in bytes of dbuf metadata cache."); -ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW, - "Maximum size in bytes of the dbuf metadata cache."); +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW, + "Set size of dbuf cache to log2 fraction of arc size."); -ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW, - "Set the size of the dbuf cache to a log2 fraction of arc size."); +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW, + "Set size of dbuf metadata cache to log2 fraction of arc size."); -ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW, 
- "Set the size of the dbuf metadata cache to a log2 fraction of arc " - "size."); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, + "Set size of dbuf cache mutex array as log2 shift."); diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c index 12bb568a08cc..ccee8997e10e 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf_stats.c +++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -46,14 +46,14 @@ static int dbuf_stats_hash_table_headers(char *buf, size_t size) { (void) snprintf(buf, size, - "%-96s | %-119s | %s\n" - "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | " + "%-105s | %-119s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | " "%-5s %-5s %-9s %-6s %-8s %-12s " "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | " "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n", "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", - "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc", - "list", "atype", "flags", "count", "asize", "access", + "blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"); @@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) __dmu_object_info_from_dnode(dn, &doi); nwritten = snprintf(buf, size, - "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d " - "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " + "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu " + "%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu " "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | " "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n", /* dmu_buf_impl_t */ @@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) (longlong_t)db->db_blkid, (u_longlong_t)db->db.db_offset, (u_longlong_t)db->db.db_size, + (u_longlong_t)dmu_buf_user_size(&db->db), !!dbuf_is_metadata(db), db->db_state, (ulong_t)zfs_refcount_count(&db->db_holds), @@ -226,7 +227,5 @@ dbuf_stats_destroy(void) dbuf_stats_hash_table_destroy(); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW, "Calculate arc header index"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index fe5a188f4da1..4c53cb0a2f9b 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,8 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
+ * Copyright (c) 2022 by Pawel Jakub Dawidek + * Copyright (c) 2023, Klara Inc. */ #include <sys/zfs_context.h> @@ -29,15 +31,119 @@ #include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/ddt.h> +#include <sys/ddt_impl.h> #include <sys/zap.h> #include <sys/dmu_tx.h> #include <sys/arc.h> #include <sys/dsl_pool.h> #include <sys/zio_checksum.h> -#include <sys/zio_compress.h> #include <sys/dsl_scan.h> #include <sys/abd.h> +/* + * # DDT: Deduplication tables + * + * The dedup subsystem provides block-level deduplication. When enabled, blocks + * to be written will have the dedup (D) bit set, which causes them to be + * tracked in a "dedup table", or DDT. If a block has been seen before (exists + * in the DDT), instead of being written, it will instead be made to reference + * the existing on-disk data, and a refcount bumped in the DDT instead. + * + * ## Dedup tables and entries + * + * Conceptually, a DDT is a dictionary or map. Each entry has a "key" + * (ddt_key_t) made up a block's checksum and certian properties, and a "value" + * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth + * time and refcount. Together these are enough to track references to a + * specific block, to build a valid block pointer to reference that block (for + * freeing, scrubbing, etc), and to fill a new block pointer with the missing + * pieces to make it seem like it was written. + * + * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[]. + * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk + * object data formats, each with their own implementations) and "classes" + * (ddt_class_t, instance of a storage type object, for entries with a specific + * characteristic). An entry (key) will only ever exist on one of these objects + * at any given time, but may be moved from one to another if their type or + * class changes. + * + * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block + * is to be written, before DVAs have been allocated, ddt_lookup() is called to + * see if the block has been seen before. If its not found, the write proceeds + * as normal, and after it succeeds, a new entry is created. If it is found, we + * fill the BP with the DVAs from the entry, increment the refcount and cause + * the write IO to return immediately. + * + * Each ddt_phys_t slot in the entry represents a separate dedup block for the + * same content/checksum. The slot is selected based on the zp_copies parameter + * the block is written with, that is, the number of DVAs in the block. The + * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" + * feature. These are no longer written, and will be freed if encountered on + * old pools. + * + * ## Lifetime of an entry + * + * A DDT can be enormous, and typically is not held in memory all at once. + * Instead, the changes to an entry are tracked in memory, and written down to + * disk at the end of each txg. + * + * A "live" in-memory entry (ddt_entry_t) is a node on the live tree + * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is + * required for IO, ddt_lookup() is called. If an entry already exists on + * ddt_tree, it is returned. Otherwise, a new one is created, and the + * type/class objects for the DDT are searched for that key. If its found, its + * value is copied into the live entry. If not, an empty entry is created. 
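/*
 * [Illustrative sketch added by the editor, not part of this commit: a
 * minimal, hypothetical caller showing the lookup flow described in the
 * paragraph above. It assumes the ddt_select()/ddt_enter()/ddt_lookup()/
 * ddt_phys_select()/ddt_phys_addref()/ddt_exit() interfaces that appear
 * elsewhere in this diff; the function name is invented for illustration.]
 */
static void
example_ddt_bump_ref(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt = ddt_select(spa, bp);	/* per-checksum table */

	ddt_enter(ddt);
	/*
	 * Find or create the live entry for this block's key; the call may
	 * wait while another thread finishes loading the same entry.
	 */
	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
	if (dde != NULL) {
		/* Pick the phys slot whose DVAs match this block pointer. */
		ddt_phys_t *ddp = ddt_phys_select(dde, bp);
		if (ddp != NULL)
			ddt_phys_addref(ddp);	/* take one more reference */
	}
	ddt_exit(ddt);
}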
+ * + * The live entry will be modified during the txg, usually by modifying the + * refcount, but sometimes by adding or updating DVAs. At the end of the txg + * (during spa_sync()), type and class are recalculated for entry (see + * ddt_sync_entry()), and the entry is written to the appropriate storage + * object and (if necessary), removed from an old one. ddt_tree is cleared and + * the next txg can start. + * + * ## Repair IO + * + * If a read on a dedup block fails, but there are other copies of the block in + * the other ddt_phys_t slots, reads will be issued for those instead + * (zio_ddt_read_start()). If one of those succeeds, the read is returned to + * the caller, and a copy is stashed on the entry's dde_repair_abd. + * + * During the end-of-txg sync, any entries with a dde_repair_abd get a + * "rewrite" write issued for the original block pointer, with the data read + * from the alternate block. If the block is actually damaged, this will invoke + * the pool's "self-healing" mechanism, and repair the block. + * + * ## Scanning (scrub/resilver) + * + * If dedup is active, the scrub machinery will walk the dedup table first, and + * scrub all blocks with refcnt > 1 first. After that it will move on to the + * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them. + * In this way, heavily deduplicated blocks are only scrubbed once. See the + * commentary on dsl_scan_ddt() for more details. + * + * Walking the DDT is done via ddt_walk(). The current position is stored in a + * ddt_bookmark_t, which represents a stable position in the storage object. + * This bookmark is stored by the scan machinery, and must reference the same + * position on the object even if the object changes, the pool is exported, or + * OpenZFS is upgraded. + * + * ## Interaction with block cloning + * + * If block cloning and dedup are both enabled on a pool, BRT will look for the + * dedup bit on an incoming block pointer. If set, it will call into the DDT + * (ddt_addref()) to add a reference to the block, instead of adding a + * reference to the BRT. See brt_pending_apply(). + */ + +/* + * These are the only checksums valid for dedup. 
They must match the list + * from dedup_table in zfs_prop.c + */ +#define DDT_CHECKSUM_VALID(c) \ + (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \ + c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \ + c == ZIO_CHECKSUM_BLAKE3) + static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_cache; @@ -46,18 +152,18 @@ static kmem_cache_t *ddt_entry_cache; */ int zfs_dedup_prefetch = 0; -static const ddt_ops_t *ddt_ops[DDT_TYPES] = { +static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; -static const char *ddt_class_name[DDT_CLASSES] = { +static const char *const ddt_class_name[DDT_CLASSES] = { "ditto", "duplicate", "unique", }; static void -ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; @@ -69,20 +175,20 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - ASSERT(*objectp == 0); - VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); - ASSERT(*objectp != 0); + ASSERT3U(*objectp, ==, 0); + VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); + ASSERT3U(*objectp, !=, 0); - VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx) == 0); + VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx)); - VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); + &ddt->ddt_histogram[type][class], tx)); } static void -ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; @@ -93,19 +199,20 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - ASSERT(*objectp != 0); + ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); - VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); - VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); - VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); - bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); + VERIFY0(ddt_object_count(ddt, type, class, &count)); + VERIFY0(count); + VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); + VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); + memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); *objectp = 0; } static int -ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; @@ -145,7 +252,7 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) } static void -ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; @@ -155,75 +262,95 @@ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - VERIFY(zap_update(ddt->ddt_os, 
ddt->ddt_spa->spa_ddt_stat_object, name, + VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); + &ddt->ddt_histogram[type][class], tx)); /* * Cache DDT statistics; this is the only time they'll change. */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0); + VERIFY0(ddt_object_info(ddt, type, class, &doi)); + VERIFY0(ddt_object_count(ddt, type, class, &count)); ddo->ddo_count = count; ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; } +static boolean_t +ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class) +{ + return (!!ddt->ddt_object[type][class]); +} + static int -ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, - ddt->ddt_object[type][class], dde)); + ddt->ddt_object[type][class], &dde->dde_key, + dde->dde_phys, sizeof (dde->dde_phys))); +} + +static int +ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class, + const ddt_key_t *ddk) +{ + if (!ddt_object_exists(ddt, type, class)) + return (SET_ERROR(ENOENT)); + + return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os, + ddt->ddt_object[type][class], ddk)); } static void -ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde) +ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, + const ddt_key_t *ddk) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, - ddt->ddt_object[type][class], dde); + ddt->ddt_object[type][class], ddk); } -int -ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +static int +ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); + ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, + sizeof (dde->dde_phys), tx)); } static int -ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - ddt_entry_t *dde, dmu_tx_t *tx) +ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, + const ddt_key_t *ddk, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); + ddt->ddt_object[type][class], ddk, tx)); } int -ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *walk, ddt_entry_t *dde) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], dde, walk)); + ddt->ddt_object[type][class], walk, &dde->dde_key, + dde->dde_phys, sizeof (dde->dde_phys))); } int -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *count) { ASSERT(ddt_object_exists(ddt, type, class)); @@ -233,7 +360,7 @@ ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } int -ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 
+ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_object_info_t *doi) { if (!ddt_object_exists(ddt, type, class)) @@ -243,14 +370,8 @@ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, doi)); } -boolean_t -ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - return (!!ddt->ddt_object[type][class]); -} - void -ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, char *name) { (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, @@ -261,7 +382,7 @@ ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) { - ASSERT(txg != 0); + ASSERT3U(txg, !=, 0); for (int d = 0; d < SPA_DVAS_PER_BP; d++) bp->blk_dva[d] = ddp->ddp_dva[d]; @@ -312,17 +433,17 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) { - ASSERT(ddp->ddp_phys_birth == 0); + ASSERT0(ddp->ddp_phys_birth); for (int d = 0; d < SPA_DVAS_PER_BP; d++) ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); + ddp->ddp_phys_birth = BP_GET_BIRTH(bp); } void ddt_phys_clear(ddt_phys_t *ddp) { - bzero(ddp, sizeof (*ddp)); + memset(ddp, 0, sizeof (*ddp)); } void @@ -335,12 +456,12 @@ void ddt_phys_decref(ddt_phys_t *ddp) { if (ddp) { - ASSERT(ddp->ddp_refcnt > 0); + ASSERT3U(ddp->ddp_refcnt, >, 0); ddp->ddp_refcnt--; } } -void +static void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) { blkptr_t blk; @@ -364,7 +485,7 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) return (ddp); } return (NULL); @@ -381,221 +502,10 @@ ddt_phys_total_refcnt(const ddt_entry_t *dde) return (refcnt); } -static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) -{ - spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); - - bzero(dds, sizeof (*dds)); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) - continue; - - for (int d = 0; d < DDE_GET_NDVAS(dde); d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); - - dds->dds_blocks += 1; - dds->dds_lsize += lsize; - dds->dds_psize += psize; - dds->dds_dsize += dsize; - - dds->dds_ref_blocks += refcnt; - dds->dds_ref_lsize += lsize * refcnt; - dds->dds_ref_psize += psize * refcnt; - dds->dds_ref_dsize += dsize * refcnt; - } -} - -void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) -{ - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); - - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ - - for (int i = 0; i < d_end - d; i++) - d[i] += (s[i] ^ neg) - neg; -} - -static void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) -{ - ddt_stat_t dds; - ddt_histogram_t *ddh; - int bucket; - - ddt_stat_generate(ddt, dde, &dds); - - bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT(bucket >= 0); - - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; - - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); -} - -void -ddt_histogram_add(ddt_histogram_t *dst, 
const ddt_histogram_t *src) -{ - for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); -} - -void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) -{ - bzero(dds, sizeof (*dds)); - - for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); -} - -boolean_t -ddt_histogram_empty(const ddt_histogram_t *ddh) -{ - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); - - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); - - return (B_TRUE); -} - -void -ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) -{ - /* Sum the statistics we cached in ddt_object_sync(). */ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_object_t *ddo = - &ddt->ddt_object_stats[type][class]; - ddo_total->ddo_count += ddo->ddo_count; - ddo_total->ddo_dspace += ddo->ddo_dspace; - ddo_total->ddo_mspace += ddo->ddo_mspace; - } - } - } - - /* ... and compute the averages. */ - if (ddo_total->ddo_count != 0) { - ddo_total->ddo_dspace /= ddo_total->ddo_count; - ddo_total->ddo_mspace /= ddo_total->ddo_count; - } -} - -void -ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) -{ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_histogram_add(ddh, - &ddt->ddt_histogram_cache[type][class]); - } - } - } -} - -void -ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) -{ - ddt_histogram_t *ddh_total; - - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); - ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); - kmem_free(ddh_total, sizeof (ddt_histogram_t)); -} - -uint64_t -ddt_get_dedup_dspace(spa_t *spa) -{ - ddt_stat_t dds_total; - - if (spa->spa_dedup_dspace != ~0ULL) - return (spa->spa_dedup_dspace); - - bzero(&dds_total, sizeof (ddt_stat_t)); - - /* Calculate and cache the stats */ - ddt_get_dedup_stats(spa, &dds_total); - spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; - return (spa->spa_dedup_dspace); -} - -uint64_t -ddt_get_pool_dedup_ratio(spa_t *spa) -{ - ddt_stat_t dds_total = { 0 }; - - ddt_get_dedup_stats(spa, &dds_total); - if (dds_total.dds_dsize == 0) - return (100); - - return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); -} - -size_t -ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) -{ - uchar_t *version = dst++; - int cpfunc = ZIO_COMPRESS_ZLE; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - size_t c_len; - - ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ - - c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); - - if (c_len == s_len) { - cpfunc = ZIO_COMPRESS_OFF; - bcopy(src, dst, s_len); - } - - *version = cpfunc; - if (ZFS_HOST_BYTEORDER) - *version |= DDT_COMPRESS_BYTEORDER_MASK; - - return (c_len + 1); -} - -void -ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) -{ - uchar_t version = *src++; - int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - - if (ci->ci_decompress != NULL) - (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - else - bcopy(src, dst, d_len); - - if (((version & 
DDT_COMPRESS_BYTEORDER_MASK) != 0) != - (ZFS_HOST_BYTEORDER != 0)) - byteswap_uint64_array(dst, d_len); -} - ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { + ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp))); return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } @@ -633,7 +543,7 @@ ddt_alloc(const ddt_key_t *ddk) ddt_entry_t *dde; dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - bzero(dde, sizeof (ddt_entry_t)); + memset(dde, 0, sizeof (ddt_entry_t)); cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -644,10 +554,10 @@ ddt_alloc(const ddt_key_t *ddk) static void ddt_free(ddt_entry_t *dde) { - ASSERT(!dde->dde_loading); + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); for (int p = 0; p < DDT_PHYS_TYPES; p++) - ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT3P(dde->dde_lead_zio[p], ==, NULL); if (dde->dde_repair_abd != NULL) abd_free(dde->dde_repair_abd); @@ -668,36 +578,48 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { - ddt_entry_t *dde, dde_search; - enum ddt_type type; - enum ddt_class class; + ddt_key_t search; + ddt_entry_t *dde; + ddt_type_t type; + ddt_class_t class; avl_index_t where; int error; ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - ddt_key_fill(&dde_search.dde_key, bp); + ddt_key_fill(&search, bp); - dde = avl_find(&ddt->ddt_tree, &dde_search, &where); - if (dde == NULL) { - if (!add) - return (NULL); - dde = ddt_alloc(&dde_search.dde_key); - avl_insert(&ddt->ddt_tree, dde, where); - } + /* Find an existing live entry */ + dde = avl_find(&ddt->ddt_tree, &search, &where); + if (dde != NULL) { + /* Found it. If it's already loaded, we can just return it. */ + if (dde->dde_flags & DDE_FLAG_LOADED) + return (dde); - while (dde->dde_loading) - cv_wait(&dde->dde_cv, &ddt->ddt_lock); + /* Someone else is loading it, wait for it. */ + while (!(dde->dde_flags & DDE_FLAG_LOADED)) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); - if (dde->dde_loaded) return (dde); + } - dde->dde_loading = B_TRUE; + /* Not found. */ + if (!add) + return (NULL); + /* Time to make a new entry. */ + dde = ddt_alloc(&search); + avl_insert(&ddt->ddt_tree, dde, where); + + /* + * ddt_tree is now stable, so unlock and let everyone else keep moving. + * Anyone landing on this entry will find it without DDE_FLAG_LOADED, + * and go to sleep waiting for it above. + */ ddt_exit(ddt); + /* Search all store objects for the entry. */ error = ENOENT; - for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); @@ -712,17 +634,16 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ddt_enter(ddt); - ASSERT(dde->dde_loaded == B_FALSE); - ASSERT(dde->dde_loading == B_TRUE); + ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED)); dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ - dde->dde_loaded = B_TRUE; - dde->dde_loading = B_FALSE; if (error == 0) ddt_stat_update(ddt, dde, -1ULL); + /* Entry loaded, everyone can proceed now */ + dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); return (dde); @@ -732,7 +653,7 @@ void ddt_prefetch(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; - ddt_entry_t dde; + ddt_key_t ddk; if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) return; @@ -743,17 +664,18 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) * Thus no locking is required as the DDT can't disappear on us. 
*/ ddt = ddt_select(spa, bp); - ddt_key_fill(&dde.dde_key, bp); + ddt_key_fill(&ddk, bp); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - ddt_object_prefetch(ddt, type, class, &dde); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &ddk); } } } /* - * Opaque struct used for ddt_key comparison + * Key comparison. Any struct wanting to make use of this function must have + * the key as the first element. */ #define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) @@ -762,12 +684,10 @@ typedef struct ddt_key_cmp { } ddt_key_cmp_t; int -ddt_entry_compare(const void *x1, const void *x2) +ddt_key_compare(const void *x1, const void *x2) { - const ddt_entry_t *dde1 = x1; - const ddt_entry_t *dde2 = x2; - const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key; - const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key; + const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1; + const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2; int32_t cmp = 0; for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { @@ -785,12 +705,12 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt_t *ddt; ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); - bzero(ddt, sizeof (ddt_t)); + memset(ddt, 0, sizeof (ddt_t)); mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&ddt->ddt_tree, ddt_entry_compare, + avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); - avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ddt->ddt_checksum = c; ddt->ddt_spa = spa; @@ -802,8 +722,8 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) static void ddt_table_free(ddt_t *ddt) { - ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); - ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); avl_destroy(&ddt->ddt_repair_tree); mutex_destroy(&ddt->ddt_lock); @@ -815,8 +735,10 @@ ddt_create(spa_t *spa) { spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - spa->spa_ddt[c] = ddt_table_alloc(spa, c); + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (DDT_CHECKSUM_VALID(c)) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); + } } int @@ -834,9 +756,12 @@ ddt_load(spa_t *spa) return (error == ENOENT ? 0 : error); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (!DDT_CHECKSUM_VALID(c)) + continue; + ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { error = ddt_object_load(ddt, type, class); if (error != 0 && error != ENOENT) @@ -847,7 +772,7 @@ ddt_load(spa_t *spa) /* * Seed the cached histograms. 
*/ - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); spa->spa_dedup_dspace = ~0ULL; } @@ -867,10 +792,10 @@ ddt_unload(spa_t *spa) } boolean_t -ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) { ddt_t *ddt; - ddt_entry_t *dde; + ddt_key_t ddk; if (!BP_GET_DEDUP(bp)) return (B_FALSE); @@ -879,20 +804,16 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) return (B_TRUE); ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - ddt_key_fill(&(dde->dde_key), bp); + ddt_key_fill(&ddk, bp); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class <= max_class; class++) { - if (ddt_object_lookup(ddt, type, class, dde) == 0) { - kmem_cache_free(ddt_entry_cache, dde); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class <= max_class; class++) { + if (ddt_object_contains(ddt, type, class, &ddk) == 0) return (B_TRUE); - } } } - kmem_cache_free(ddt_entry_cache, dde); return (B_FALSE); } @@ -906,8 +827,8 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) dde = ddt_alloc(&ddk); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { /* * We can only do repair if there are multiple copies * of the block. For anything in the UNIQUE class, @@ -919,7 +840,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - bzero(dde->dde_phys, sizeof (dde->dde_phys)); + memset(dde->dde_phys, 0, sizeof (dde->dde_phys)); return (dde); } @@ -964,7 +885,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth != rddp->ddp_phys_birth || - bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, @@ -1006,19 +927,18 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; - enum ddt_type otype = dde->dde_type; - enum ddt_type ntype = DDT_TYPE_CURRENT; - enum ddt_class oclass = dde->dde_class; - enum ddt_class nclass; + ddt_type_t otype = dde->dde_type; + ddt_type_t ntype = DDT_TYPE_DEFAULT; + ddt_class_t oclass = dde->dde_class; + ddt_class_t nclass; uint64_t total_refcnt = 0; - ASSERT(dde->dde_loaded); - ASSERT(!dde->dde_loading); + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT3P(dde->dde_lead_zio[p], ==, NULL); if (ddp->ddp_phys_birth == 0) { - ASSERT(ddp->ddp_refcnt == 0); + ASSERT0(ddp->ddp_refcnt); continue; } if (p == DDT_PHYS_DITTO) { @@ -1043,8 +963,9 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) if (otype != DDT_TYPES && (otype != ntype || oclass != nclass || total_refcnt == 0)) { - VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); - ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + VERIFY0(ddt_object_remove(ddt, 
otype, oclass, ddk, tx)); + ASSERT3U( + ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); } if (total_refcnt != 0) { @@ -1053,7 +974,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_stat_update(ddt, dde, 0); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); /* * If the class changes, the order that we scan this bp @@ -1079,7 +1000,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) if (avl_numnodes(&ddt->ddt_tree) == 0) return; - ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, @@ -1092,23 +1013,23 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ddt_free(dde); } - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { uint64_t add, count = 0; - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); - VERIFY(ddt_object_count(ddt, type, class, - &add) == 0); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); count += add; } } - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (count == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } } - bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); spa->spa_dedup_dspace = ~0ULL; } @@ -1120,7 +1041,7 @@ ddt_sync(spa_t *spa, uint64_t txg) dmu_tx_t *tx; zio_t *rio; - ASSERT(spa_syncing_txg(spa) == txg); + ASSERT3U(spa_syncing_txg(spa), ==, txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1157,6 +1078,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) do { do { ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + if (ddt == NULL) + continue; int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { @@ -1180,7 +1103,68 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) return (SET_ERROR(ENOENT)); } -/* BEGIN CSTYLED */ +/* + * This function is used by Block Cloning (brt.c) to increase reference + * counter for the DDT entry if the block is already in DDT. + * + * Return false if the block, despite having the D bit set, is not present + * in the DDT. Currently this is not possible but might be in the future. + * See the comment below. + */ +boolean_t +ddt_addref(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t *dde; + boolean_t result; + + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + ddt = ddt_select(spa, bp); + ddt_enter(ddt); + + dde = ddt_lookup(ddt, bp, B_TRUE); + ASSERT3P(dde, !=, NULL); + + if (dde->dde_type < DDT_TYPES) { + ddt_phys_t *ddp; + + ASSERT3S(dde->dde_class, <, DDT_CLASSES); + + ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; + + /* + * This entry already existed (dde_type is real), so it must + * have refcnt >0 at the start of this txg. We are called from + * brt_pending_apply(), before frees are issued, so the refcnt + * can't be lowered yet. Therefore, it must be >0. 
We assert + * this because if the order of BRT and DDT interactions were + * ever to change and the refcnt was ever zero here, then + * likely further action is required to fill out the DDT entry, + * and this is a place that is likely to be missed in testing. + */ + ASSERT3U(ddp->ddp_refcnt, >, 0); + + ddt_phys_addref(ddp); + result = B_TRUE; + } else { + /* + * At the time of implementating this if the block has the + * DEDUP flag set it must exist in the DEDUP table, but + * there are many advocates that want ability to remove + * entries from DDT with refcnt=1. If this will happen, + * we may have a block with the DEDUP set, but which doesn't + * have a corresponding entry in the DDT. Be ready. + */ + ASSERT3S(dde->dde_class, ==, DDT_CLASSES); + ddt_remove(ddt, dde); + result = B_FALSE; + } + + ddt_exit(ddt); + spa_config_exit(spa, SCL_ZIO, FTAG); + + return (result); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/ddt_stats.c b/sys/contrib/openzfs/module/zfs/ddt_stats.c new file mode 100644 index 000000000000..af5365a1d114 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/ddt_stats.c @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2022 by Pawel Jakub Dawidek + * Copyright (c) 2023, Klara Inc. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/ddt.h> +#include <sys/ddt_impl.h> + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + memset(dds, 0, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? 
+ SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + for (int d = 0; d < ndvas; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + for (int i = 0; i < d_end - d; i++) + d[i] += (s[i] ^ neg) - neg; +} + +void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + ASSERT3U(bucket, >=, 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + memset(dds, 0, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. 
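An illustrative note on ddt_stat_update() above: an entry is charged to histogram bucket highbit64(dds_ref_blocks) - 1, so bucket k collects entries referenced between 2^k and 2^(k+1)-1 times. A minimal standalone sketch of that bucketing, with a portable stand-in for the kernel's highbit64():

#include <stdint.h>
#include <stdio.h>

/* Stand-in for highbit64(): 1-based index of the highest set bit, 0 for 0. */
static int
highbit64(uint64_t v)
{
    int h = 0;

    while (v != 0) {
        h++;
        v >>= 1;
    }
    return (h);
}

int
main(void)
{
    /* ddt_stat_update() buckets a DDT entry by its reference count. */
    uint64_t refcnts[] = { 1, 2, 3, 8, 1000 };

    for (int i = 0; i < 5; i++) {
        int bucket = highbit64(refcnts[i]) - 1;
        printf("refcnt %4llu -> histogram bucket %d\n",
            (unsigned long long)refcnts[i], bucket);
    }
    return (0);
}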
*/ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total; + + if (spa->spa_dedup_dspace != ~0ULL) + return (spa->spa_dedup_dspace); + + memset(&dds_total, 0, sizeof (ddt_stat_t)); + + /* Calculate and cache the stats */ + ddt_get_dedup_stats(spa, &dds_total); + spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; + return (spa->spa_dedup_dspace); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c index c5c9eda0b2d0..741554de3c60 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_zap.c +++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
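Another aside on the same file: ddt_stat_add() treats both ddt_stat_t structures as flat arrays of uint64_t and relies on the identity (x ^ ~0) - ~0 == -x in two's complement, so a single loop either accumulates (neg == 0) or backs out (neg == -1ULL) a set of stats. A self-contained sketch of just that trick; the function and values below are invented for the demo and are not the ZFS code itself:

#include <stdint.h>
#include <stdio.h>

/* Add src into dst when neg == 0; subtract it when neg == -1ULL. */
static void
stat_add(uint64_t *dst, const uint64_t *src, size_t n, uint64_t neg)
{
    for (size_t i = 0; i < n; i++) {
        /* (x ^ ~0) - ~0 == ~x + 1 == -x, so this adds or subtracts. */
        dst[i] += (src[i] ^ neg) - neg;
    }
}

int
main(void)
{
    uint64_t total[2] = { 100, 7 };
    uint64_t delta[2] = { 40, 2 };

    stat_add(total, delta, 2, 0);       /* add: 140 9 */
    printf("%llu %llu\n", (unsigned long long)total[0],
        (unsigned long long)total[1]);

    stat_add(total, delta, 2, -1ULL);   /* subtract: back to 100 7 */
    printf("%llu %llu\n", (unsigned long long)total[0],
        (unsigned long long)total[1]);
    return (0);
}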
* @@ -28,11 +28,60 @@ #include <sys/spa.h> #include <sys/zio.h> #include <sys/ddt.h> +#include <sys/ddt_impl.h> #include <sys/zap.h> #include <sys/dmu_tx.h> +#include <sys/zio_compress.h> -int ddt_zap_leaf_blockshift = 12; -int ddt_zap_indirect_blockshift = 12; +static unsigned int ddt_zap_default_bs = 15; +static unsigned int ddt_zap_default_ibs = 15; + +#define DDT_ZAP_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_ZAP_COMPRESS_FUNCTION_MASK 0x7f + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +static size_t +ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1, + ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + memcpy(dst, src, s_len); + } + + *version = cpfunc; + if (ZFS_HOST_BYTEORDER) + *version |= DDT_ZAP_COMPRESS_BYTEORDER_MASK; + + return (c_len + 1); +} + +static void +ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + memcpy(dst, src, d_len); + + if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) + byteswap_uint64_array(dst, d_len); +} static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) @@ -43,10 +92,12 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + ddt_zap_default_bs, ddt_zap_default_ibs, DMU_OT_NONE, 0, tx); + if (*objectp == 0) + return (SET_ERROR(ENOTSUP)); - return (*objectp == 0 ? 
SET_ERROR(ENOTSUP) : 0); + return (0); } static int @@ -56,63 +107,75 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) } static int -ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +ddt_zap_lookup(objset_t *os, uint64_t object, + const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; int error; - cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_SLEEP); - - error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + error = zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, &one, &csize); if (error) - goto out; + return (error); - ASSERT(one == 1); - ASSERT(csize <= (sizeof (dde->dde_phys) + 1)); + ASSERT3U(one, ==, 1); + ASSERT3U(csize, <=, psize + 1); - error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + cbuf = kmem_alloc(csize, KM_SLEEP); + + error = zap_lookup_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, 1, csize, cbuf); - if (error) - goto out; + if (error == 0) + ddt_zap_decompress(cbuf, phys, csize, psize); - ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); -out: - kmem_free(cbuf, sizeof (dde->dde_phys) + 1); + kmem_free(cbuf, csize); return (error); } +static int +ddt_zap_contains(objset_t *os, uint64_t object, const ddt_key_t *ddk) +{ + return (zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, + NULL, NULL)); +} + static void -ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk) { - (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS); + (void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS); } static int -ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, + const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t csize; + const size_t cbuf_size = psize + 1; + + uchar_t *cbuf = kmem_alloc(cbuf_size, KM_SLEEP); + + uint64_t csize = ddt_zap_compress(phys, cbuf, psize, cbuf_size); - csize = ddt_compress(dde->dde_phys, cbuf, - sizeof (dde->dde_phys), sizeof (cbuf)); + int error = zap_update_uint64(os, object, (uint64_t *)ddk, + DDT_KEY_WORDS, 1, csize, cbuf, tx); - return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, 1, csize, cbuf, tx)); + kmem_free(cbuf, cbuf_size); + + return (error); } static int -ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, + dmu_tx_t *tx) { - return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + return (zap_remove_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, tx)); } static int -ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, + ddt_phys_t *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; @@ -131,17 +194,23 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) zap_cursor_init_serialized(&zc, os, object, *walk); } if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize = za.za_num_integers; - ASSERT(za.za_integer_length == 1); + + ASSERT3U(za.za_integer_length, ==, 1); + ASSERT3U(csize, <=, psize + 1); + + uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, 
DDT_KEY_WORDS, 1, csize, cbuf); - ASSERT(error == 0); + ASSERT0(error); if (error == 0) { - ddt_decompress(cbuf, dde->dde_phys, csize, - sizeof (dde->dde_phys)); - dde->dde_key = *(ddt_key_t *)za.za_name; + ddt_zap_decompress(cbuf, phys, csize, psize); + *ddk = *(ddt_key_t *)za.za_name; } + + kmem_free(cbuf, csize); + zap_cursor_advance(&zc); *walk = zap_cursor_serialize(&zc); } @@ -160,9 +229,17 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_create, ddt_zap_destroy, ddt_zap_lookup, + ddt_zap_contains, ddt_zap_prefetch, ddt_zap_update, ddt_zap_remove, ddt_zap_walk, ddt_zap_count, }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, + "DDT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, + "DDT ZAP indirect blockshift"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index eee3e70bbc95..8b440aafba43 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,6 +28,8 @@ * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/dmu.h> @@ -51,6 +53,7 @@ #include <sys/sa.h> #include <sys/zfeature.h> #include <sys/abd.h> +#include <sys/brt.h> #include <sys/trace_zfs.h> #include <sys/zfs_racct.h> #include <sys/zfs_rlock.h> @@ -62,7 +65,7 @@ /* * Enable/disable nopwrite feature. */ -int zfs_nopwrite_enabled = 1; +static int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into @@ -70,19 +73,27 @@ int zfs_nopwrite_enabled = 1; * will wait until the next TXG. * A value of zero will disable this throttle. */ -unsigned long zfs_per_txg_dirty_frees_percent = 5; +static uint_t zfs_per_txg_dirty_frees_percent = 30; /* - * Enable/disable forcing txg sync when dirty in dmu_offset_next. + * Enable/disable forcing txg sync when dirty checking for holes with lseek(). + * By default this is enabled to ensure accurate hole reporting, it can result + * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads. + * Disabling this option will result in holes never being reported in dirty + * files which is always safe. */ -int zfs_dmu_offset_next_sync = 0; +static int zfs_dmu_offset_next_sync = 1; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. 
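Stepping back to the ddt_zap_compress()/ddt_zap_decompress() pair in the ddt_zap.c hunk above: every stored DDT ZAP value is prefixed by one header byte whose low seven bits name the compression function and whose high bit records ZFS_HOST_BYTEORDER at write time, which is how the reader decides whether to byteswap. A standalone sketch of only that header convention; the mask values mirror the patch, while the enum and helper are invented for the demo:

#include <stdint.h>
#include <stdio.h>

#define COMPRESS_BYTEORDER_MASK  0x80  /* high bit: writer's byte order */
#define COMPRESS_FUNCTION_MASK   0x7f  /* low 7 bits: compression id */

/* Illustrative ids; the real ones come from zio_compress.h. */
enum { DEMO_COMPRESS_OFF = 0, DEMO_COMPRESS_ZLE = 1 };

static uint8_t
pack_header(int cpfunc, int host_byteorder)
{
    uint8_t version = (uint8_t)(cpfunc & COMPRESS_FUNCTION_MASK);

    if (host_byteorder)
        version |= COMPRESS_BYTEORDER_MASK;
    return (version);
}

int
main(void)
{
    uint8_t hdr = pack_header(DEMO_COMPRESS_ZLE, 1);

    printf("compression id = %d, byteorder flag = %s\n",
        hdr & COMPRESS_FUNCTION_MASK,
        (hdr & COMPRESS_BYTEORDER_MASK) ? "set" : "clear");
    return (0);
}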
*/ -int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +#ifdef _ILP32 +uint_t dmu_prefetch_max = 8 * 1024 * 1024; +#else +uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +#endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, @@ -141,7 +152,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; -const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { +dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, @@ -154,9 +165,9 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; -static int +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp) + const void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; @@ -174,9 +185,10 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, *dbp = &db->db; return (0); } + int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp) + const void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; @@ -203,7 +215,7 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) + const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; @@ -228,7 +240,7 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - void *tag, dmu_buf_t **dbp, int flags) + const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; @@ -338,7 +350,7 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) * has not yet been allocated a new bonus dbuf a will be allocated. * Returns ENOENT, EIO, or 0. */ -int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, +int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; @@ -352,8 +364,10 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { - rw_exit(&dn->dn_struct_rwlock); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } @@ -385,7 +399,7 @@ int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, } int -dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; @@ -410,7 +424,8 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) * dmu_spill_hold_existing() should be used. 
*/ int -dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, + dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; @@ -438,7 +453,7 @@ dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) } int -dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; @@ -467,7 +482,7 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) } int -dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; @@ -494,7 +509,8 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) + boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, + uint32_t flags) { dmu_buf_t **dbp; zstream_t *zs = NULL; @@ -504,7 +520,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio_t *zio = NULL; boolean_t missed = B_FALSE; - ASSERT(length <= DMU_MAX_ACCESS); + ASSERT(!read || length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that @@ -514,11 +530,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; + if ((flags & DMU_READ_NO_DECRYPT) != 0) + dbuf_flags |= DB_RF_NO_DECRYPT; + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; + P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t)) + >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " @@ -538,21 +558,22 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { + if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we * base predictions on the original less racy request order. */ - zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn), B_TRUE); + zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read, + B_TRUE); } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { - if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + if (zs) { + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, + B_TRUE); + } rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) @@ -569,6 +590,14 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, * state will not yet be CACHED. 
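For reference, the nblks computation in dmu_buf_hold_array_by_dnode() above (now spelled with P2ALIGN_TYPED) counts how many power-of-two data blocks the byte range [offset, offset + length) touches: round the end up and the start down to block boundaries, then shift. A small self-contained example of that arithmetic, using simplified P2 macros and made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* Simplified forms of the P2 macros for a power-of-two 'align'. */
#define P2ALIGN(x, align)   ((x) & -(align))
#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)

int
main(void)
{
    uint64_t offset = 130048;   /* not block aligned */
    uint64_t length = 262144;
    int blkshift = 17;          /* 128 KiB data blocks */
    uint64_t blksz = 1ULL << blkshift;

    uint64_t nblks = (P2ROUNDUP(offset + length, blksz) -
        P2ALIGN(offset, blksz)) >> blkshift;

    /* Bytes [130048, 392192) touch blocks 0, 1 and 2, so nblks == 3. */
    printf("nblks = %llu\n", (unsigned long long)nblks);
    return (0);
}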
*/ if (read) { + if (i == nblks - 1 && blkid + i < dn->dn_maxblkid && + offset + length < db->db.db_offset + + db->db.db_size) { + if (offset <= db->db.db_offset) + dbuf_flags |= DB_RF_PARTIAL_FIRST; + else + dbuf_flags |= DB_RF_PARTIAL_MORE; + } (void) dbuf_read(db, zio, dbuf_flags); if (db->db_state != DB_CACHED) missed = B_TRUE; @@ -580,7 +609,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zfs_racct_write(length, nblks); if (zs) - dmu_zfetch_run(zs, missed, B_TRUE); + dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { @@ -615,7 +644,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) + uint64_t length, int read, const void *tag, int *numbufsp, + dmu_buf_t ***dbpp) { dnode_t *dn; int err; @@ -634,7 +664,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, - uint64_t length, boolean_t read, void *tag, int *numbufsp, + uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -651,7 +681,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, } void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; @@ -668,72 +698,99 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max. It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); - if (object == 0 || object >= DN_MAX_OBJECT) - return; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. 
- */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); +} - dnode_rele(dn, FTAG); +/* + * Issue prefetch I/Os for the given object's dnode. + */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); } /* @@ -798,7 +855,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) } /* set start to the beginning of this L1 indirect */ - *start = P2ALIGN(*start, iblkrange); + *start = P2ALIGN_TYPED(*start, iblkrange, uint64_t); } if (*start < minimum) *start = minimum; @@ -812,13 +869,14 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) * otherwise return false. 
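The new dmu_prefetch_by_dnode() above caps direct prefetch at dmu_prefetch_max and covers the remainder of a long request with indirect blocks at a higher level, so an arbitrarily large range still issues a bounded number of prefetch I/Os. Below is a simplified standalone sketch of that split which climbs only one level (the real loop keeps raising level2 until the tail fits the budget); the geometry constants are invented for the example:

#include <stdint.h>
#include <stdio.h>

#define P2ROUNDUP(x, a) ((((x) - 1) | ((a) - 1)) + 1)

int
main(void)
{
    /* Hypothetical geometry: 128 KiB data blocks and indirect blocks. */
    unsigned dblkshift = 17;
    unsigned iblkshift = 17;
    unsigned blkptrshift = 7;              /* sizeof (blkptr_t) == 128 */
    uint64_t prefetch_max = 8ULL << 20;    /* 8 MiB budget */

    uint64_t offset = 0, len = 1ULL << 30; /* 1 GiB request */

    /* Level-0 block range [start, end2) covering the request. */
    uint64_t start = offset >> dblkshift;
    uint64_t end2 = ((offset + len - 1) >> dblkshift) + 1;

    /* Clamp the directly prefetched part to the budget. */
    uint64_t limit = P2ROUNDUP(prefetch_max, 1ULL << dblkshift) >> dblkshift;
    uint64_t end = (end2 < start + limit) ? end2 : start + limit;

    /* Express the tail as level-1 indirects, skipping any already covered. */
    unsigned ibps = iblkshift - blkptrshift;   /* L0 blocks per indirect */
    uint64_t start2 = P2ROUNDUP(end, 1ULL << ibps) >> ibps;
    uint64_t end2_l1 = P2ROUNDUP(end2, 1ULL << ibps) >> ibps;

    printf("level 0: blocks [%llu, %llu)\n",
        (unsigned long long)start, (unsigned long long)end);
    printf("level 1: blocks [%llu, %llu)\n",
        (unsigned long long)start2, (unsigned long long)end2_l1);
    return (0);
}

With these numbers the sketch prefetches 64 data blocks directly and 7 level-1 indirect blocks for the rest, instead of issuing 8192 individual block prefetches.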
* Used below in dmu_free_long_range_impl() to enable abort when unmounting */ -/*ARGSUSED*/ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); +#else + (void) os; #endif return (B_FALSE); } @@ -1007,7 +1065,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); - bzero((char *)buf + newsz, size - newsz); + memset((char *)buf + newsz, 0, size - newsz); size = newsz; } @@ -1087,14 +1145,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; @@ -1302,27 +1360,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) ASSERT(size > 0); - bufoff = zfs_uio_offset(uio) - db->db_offset; + offset_t off = zfs_uio_offset(uio); + bufoff = off - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_TRUE); else dmu_buf_will_dirty(db, tx); - /* - * XXX zfs_uiomove could block forever (eg.nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that zfs_uiomove won't - * block. - */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) { + /* The fill was reverted. Undo any uio progress. */ + zfs_uio_advance(uio, off - zfs_uio_offset(uio)); + } if (err) break; @@ -1424,7 +1479,7 @@ dmu_return_arcbuf(arc_buf_t *buf) */ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, - const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) + const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); @@ -1454,9 +1509,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); - rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the @@ -1500,10 +1555,10 @@ typedef struct { dmu_tx_t *dsa_tx; } dmu_sync_arg_t; -/* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { + (void) buf; dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; @@ -1528,10 +1583,10 @@ dmu_sync_late_arrival_ready(zio_t *zio) dmu_sync_ready(zio, NULL, zio->io_private); } -/* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { + (void) buf; dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; @@ -1575,7 +1630,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * it's an old style hole. 
*/ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && - dr->dt.dl.dr_overridden_by.blk_birth == 0) + BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1606,7 +1661,7 @@ dmu_sync_late_arrival_done(zio_t *zio) blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } @@ -1626,10 +1681,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, { dmu_sync_arg_t *dsa; dmu_tx_t *tx; + int error; + + error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, + DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (error != 0) + return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + /* + * This transaction does not produce any dirty data or log blocks, so + * it should not be throttled. All other cases wait for TXG sync, by + * which time the log block we are writing will be obsolete, so we can + * skip waiting and just return error here instead. + */ + if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); @@ -1674,7 +1741,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); @@ -1838,9 +1905,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; - zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, + zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, + dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), + &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -1935,7 +2002,7 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. 
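The dmu_write_policy() change a little further down replaces the old compound conditional with an explicit switch over the four redundant_metadata settings (all, most, some, none). A condensed standalone sketch of that copies decision; is_critical stands in for DMU_OT_IS_CRITICAL(), ditto_level for zfs_redundant_metadata_most_ditto_level, and the spill-block case is left out:

#include <stdio.h>

/* Mirrors the four ZFS_REDUNDANT_METADATA_* property values. */
enum redundant_metadata { RM_ALL, RM_MOST, RM_SOME, RM_NONE };

static int
metadata_copies(int copies, enum redundant_metadata policy, int level,
    int is_metadata, int is_critical, int ditto_level)
{
    switch (policy) {
    case RM_ALL:
        copies++;
        break;
    case RM_MOST:
        if (level >= ditto_level || is_metadata)
            copies++;
        break;
    case RM_SOME:
        if (is_critical)
            copies++;
        break;
    case RM_NONE:
        break;
    }
    return (copies);
}

int
main(void)
{
    printf("most, level-1 indirect:  %d\n",
        metadata_copies(1, RM_MOST, 1, 0, 0, 2));  /* stays 1 */
    printf("most, level-2 indirect:  %d\n",
        metadata_copies(1, RM_MOST, 2, 0, 0, 2));  /* bumped to 2 */
    printf("some, critical metadata: %d\n",
        metadata_copies(1, RM_SOME, 0, 1, 1, 2));  /* bumped to 2 */
    return (0);
}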
*/ -int zfs_redundant_metadata_most_ditto_level = 2; +static const int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) @@ -1981,12 +2048,22 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; - if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || - (os->os_redundant_metadata == - ZFS_REDUNDANT_METADATA_MOST && - (level >= zfs_redundant_metadata_most_ditto_level || - DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) + switch (os->os_redundant_metadata) { + case ZFS_REDUNDANT_METADATA_ALL: copies++; + break; + case ZFS_REDUNDANT_METADATA_MOST: + if (level >= zfs_redundant_metadata_most_ditto_level || + DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) + copies++; + break; + case ZFS_REDUNDANT_METADATA_SOME: + if (DMU_OT_IS_CRITICAL(type)) + copies++; + break; + case ZFS_REDUNDANT_METADATA_NONE: + break; + } } else if (wp & WP_NOFILL) { ASSERT(level == 0); @@ -2072,9 +2149,9 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; - bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); - bzero(zp->zp_iv, ZIO_DATA_IV_LEN); - bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); + memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); + memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); + memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; @@ -2082,18 +2159,18 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } /* - * This function is only called from zfs_holey_common() for zpl_llseek() - * in order to determine the location of holes. In order to accurately - * report holes all dirty data must be synced to disk. This causes extremely - * poor performance when seeking for holes in a dirty file. As a compromise, - * only provide hole data when the dnode is clean. When a dnode is dirty - * report the dnode as having no holes which is always a safe thing to do. + * Reports the location of data and holes in an object. In order to + * accurately report holes all dirty data must be synced to disk. This + * causes extremely poor performance when seeking for holes in a dirty file. + * As a compromise, only provide hole data when the dnode is clean. When + * a dnode is dirty report the dnode as having no holes by returning EBUSY + * which is always safe to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; - int err; + int restarted = 0, err; restart: err = dnode_hold(os, object, FTAG, &dn); @@ -2105,19 +2182,23 @@ restart: if (dnode_is_dirty(dn)) { /* * If the zfs_dmu_offset_next_sync module option is enabled - * then strict hole reporting has been requested. Dirty - * dnodes must be synced to disk to accurately report all - * holes. When disabled (the default) dirty dnodes are - * reported to not have any holes which is always safe. + * then hole reporting has been requested. Dirty dnodes + * must be synced to disk to accurately report holes. * - * When called by zfs_holey_common() the zp->z_rangelock - * is held to prevent zfs_write() and mmap writeback from - * re-dirtying the dnode after txg_wait_synced(). + * Provided a RL_READER rangelock spanning 0-UINT64_MAX is + * held by the caller only a single restart will be required. 
+ * We tolerate callers which do not hold the rangelock by + * returning EBUSY and not reporting holes after one restart. */ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); + + if (restarted) + return (SET_ERROR(EBUSY)); + txg_wait_synced(dmu_objset_pool(os), 0); + restarted = 1; goto restart; } @@ -2133,6 +2214,187 @@ restart: return (err); } +int +dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, + blkptr_t *bps, size_t *nbpsp) +{ + dmu_buf_t **dbp, *dbuf; + dmu_buf_impl_t *db; + blkptr_t *bp; + int error, numbufs; + + error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, + &numbufs, &dbp); + if (error != 0) { + if (error == ESRCH) { + error = SET_ERROR(ENXIO); + } + return (error); + } + + ASSERT3U(numbufs, <=, *nbpsp); + + for (int i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + + mutex_enter(&db->db_mtx); + + if (!list_is_empty(&db->db_dirty_records)) { + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + if (dr->dt.dl.dr_brtwrite) { + /* + * This is very special case where we clone a + * block and in the same transaction group we + * read its BP (most likely to clone the clone). + */ + bp = &dr->dt.dl.dr_overridden_by; + } else { + /* + * The block was modified in the same + * transaction group. + */ + mutex_exit(&db->db_mtx); + error = SET_ERROR(EAGAIN); + goto out; + } + } else { + bp = db->db_blkptr; + } + + mutex_exit(&db->db_mtx); + + if (bp == NULL) { + /* + * The file size was increased, but the block was never + * written, otherwise we would either have the block + * pointer or the dirty record and would not get here. + * It is effectively a hole, so report it as such. + */ + BP_ZERO(&bps[i]); + continue; + } + /* + * Make sure we clone only data blocks. + */ + if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * If the block was allocated in transaction group that is not + * yet synced, we could clone it, but we couldn't write this + * operation into ZIL, or it may be impossible to replay, since + * the block may appear not yet allocated at that point. + */ + if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + error = SET_ERROR(EINVAL); + goto out; + } + if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + error = SET_ERROR(EAGAIN); + goto out; + } + + bps[i] = *bp; + } + + *nbpsp = numbufs; +out: + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); +} + +int +dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) +{ + spa_t *spa; + dmu_buf_t **dbp, *dbuf; + dmu_buf_impl_t *db; + struct dirty_leaf *dl; + dbuf_dirty_record_t *dr; + const blkptr_t *bp; + int error = 0, i, numbufs; + + spa = os->os_spa; + + VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, + &numbufs, &dbp)); + ASSERT3U(nbps, ==, numbufs); + + /* + * Before we start cloning make sure that the dbufs sizes match new BPs + * sizes. If they don't, that's a no-go, as we are not able to shrink + * dbufs. 
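To summarize the eligibility tests spread across dmu_read_l0_bps() above and the size check that follows in dmu_brt_clone(): a block pointer can be cloned only if it is a hole, or a plain data block that is already on disk and whose logical size matches the destination dbuf. A hedged, condensed sketch of those decisions; the toy_bp type and helper are invented, and the real code works on blkptr_t and dbufs and also handles the dirty and frozen-pool cases:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the block pointer fields the clone path inspects. */
struct toy_bp {
    int      is_hole;
    int      is_metadata;
    uint64_t lsize;
    uint64_t birth_txg;
};

static int
clone_check(const struct toy_bp *bp, uint64_t dbuf_size,
    uint64_t last_synced_txg)
{
    if (bp->is_hole)
        return (0);             /* holes clone trivially */
    if (bp->is_metadata)
        return (EINVAL);        /* only data blocks may be cloned */
    if (bp->birth_txg > last_synced_txg)
        return (EAGAIN);        /* block not on disk yet, retry later */
    if (bp->lsize != dbuf_size)
        return (EXDEV);         /* destination dbuf cannot be resized */
    return (0);
}

int
main(void)
{
    struct toy_bp ok = { 0, 0, 131072, 90 };
    struct toy_bp young = { 0, 0, 131072, 120 };

    printf("synced block:   %d\n", clone_check(&ok, 131072, 100));
    printf("unsynced block: %d\n", clone_check(&young, 131072, 100));
    return (0);
}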
+ */ + for (i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); + + if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { + error = SET_ERROR(EXDEV); + goto out; + } + } + + for (i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); + ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); + + dmu_buf_will_clone(dbuf, tx); + + mutex_enter(&db->db_mtx); + + dr = list_head(&db->db_dirty_records); + VERIFY(dr != NULL); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; + dl->dr_overridden_by = *bp; + if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { + if (!BP_IS_EMBEDDED(bp)) { + BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, + BP_GET_BIRTH(bp)); + } else { + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, + dr->dr_txg); + } + } + dl->dr_brtwrite = B_TRUE; + dl->dr_override_state = DR_OVERRIDDEN; + + mutex_exit(&db->db_mtx); + + /* + * When data in embedded into BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as + * it contains the data. + */ + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + brt_pending_add(spa, bp, tx); + } + } +out: + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); +} + void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { @@ -2272,10 +2534,10 @@ byteswap_uint16_array(void *vbuf, size_t size) buf[i] = BSWAP_16(buf[i]); } -/* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { + (void) vbuf, (void) size; } void @@ -2313,6 +2575,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); @@ -2341,16 +2605,15 @@ EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); -ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); -ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW, +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c index a573a2e1bd41..0def0956beb8 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_diff.c +++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -108,15 +108,15 @@ report_dnode(dmu_diffarg_t *da, uint64_t object, dnode_phys_t *dnp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) -/* ARGSUSED */ static int diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) zilog; dmu_diffarg_t *da = arg; int err = 0; - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); if (zb->zb_level == ZB_DNODE_LEVEL || diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c index 12cdbd68b104..56986ea43446 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_object.c +++ b/sys/contrib/openzfs/module/zfs/dmu_object.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -41,12 +41,12 @@ * determined to be the lowest value that eliminates the measurable effect * of lock contention from this code path. */ -int dmu_object_alloc_chunk_shift = 7; +uint_t dmu_object_alloc_chunk_shift = 7; static uint64_t dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << @@ -55,7 +55,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int dn_slots = dnodesize >> DNODE_SHIFT; boolean_t restarted = B_FALSE; uint64_t *cpuobj = NULL; - int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; int error; cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE % @@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, * is not suitably aligned. */ os->os_obj_next_chunk = - P2ALIGN(object, dnodes_per_chunk) + + P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) + dnodes_per_chunk; (void) atomic_swap_64(cpuobj, object); mutex_exit(&os->os_obj_lock); @@ -255,7 +255,7 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, - int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); @@ -409,6 +409,8 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) * hand off to dnode_next_offset() for further scanning. 
*/ while (i <= last_obj) { + if (i == 0) + return (SET_ERROR(ESRCH)); error = dmu_object_info(os, i, &doi); if (error == ENOENT) { if (hole) { @@ -518,6 +520,6 @@ EXPORT_SYMBOL(dmu_object_zapify); EXPORT_SYMBOL(dmu_object_free_zapified); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW, "CPU-specific allocator grabs 2^N objects at once"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index b30a9d619034..8f4fefa4f4dd 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -32,6 +32,7 @@ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -78,16 +79,16 @@ krwlock_t os_lock; * datasets. * Default is 4 times the number of leaf vdevs. */ -int dmu_find_threads = 0; +static const int dmu_find_threads = 0; /* * Backfill lower metadnode objects after this many have been freed. * Backfilling negatively impacts object creation rates, so only do it * if there are enough holes to fill. */ -int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT; +static const int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT; -static char *upgrade_tag = "upgrade_tag"; +static const char *upgrade_tag = "upgrade_tag"; static void dmu_objset_find_dp_cb(void *arg); @@ -263,6 +264,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) } static void +prefetch_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE || + newval == ZFS_PREFETCH_METADATA); + os->os_prefetch = newval; +} + +static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; @@ -287,7 +301,9 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval) * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || - newval == ZFS_REDUNDANT_METADATA_MOST); + newval == ZFS_REDUNDANT_METADATA_MOST || + newval == ZFS_REDUNDANT_METADATA_SOME || + newval == ZFS_REDUNDANT_METADATA_NONE); os->os_redundant_metadata = newval; } @@ -384,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj) ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); /* - * The low 6 bits of the pointer don't have much entropy, because - * the objset_t is larger than 2^6 bytes long. + * The lower 11 bits of the pointer don't have much entropy, because + * the objset_t is more than 1KB long and so likely aligned to 2KB. 
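The dnode_hash() change just below starts folding the objset_t pointer into the CRC at bit 11 instead of bit 6, for the reason given in the comment above: an object larger than 1 KiB is typically handed out 2 KiB-aligned, so its low 11 address bits are always zero and contribute nothing to the hash. A toy demonstration of that alignment argument; nothing here is ZFS-specific, and the 2 KiB alignment is forced with aligned_alloc for determinism:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    void *objs[3];

    for (int i = 0; i < 3; i++) {
        objs[i] = aligned_alloc(2048, 2048);
        if (objs[i] == NULL)
            return (1);
    }

    for (int i = 0; i < 3; i++) {
        uintptr_t v = (uintptr_t)objs[i];

        /* Low 11 bits are identical (zero); useful entropy starts above. */
        printf("ptr %p  low 11 bits %#lx  (v >> 11) & 0xff = %#lx\n",
            objs[i], (unsigned long)(v & 0x7ff),
            (unsigned long)((v >> 11) & 0xff));
    }

    for (int i = 0; i < 3; i++)
        free(objs[i]);
    return (0);
}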
*/ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; @@ -416,28 +432,28 @@ dnode_multilist_index_func(multilist_t *ml, void *obj) static inline boolean_t dmu_os_is_l2cacheable(objset_t *os) { - vdev_t *vd = NULL; - zfs_cache_type_t cache = os->os_secondary_cache; - blkptr_t *bp = os->os_rootbp; + if (os->os_secondary_cache == ZFS_CACHE_ALL || + os->os_secondary_cache == ZFS_CACHE_METADATA) { + if (l2arc_exclude_special == 0) + return (B_TRUE); - if (bp != NULL && !BP_IS_HOLE(bp)) { + blkptr_t *bp = os->os_rootbp; + if (bp == NULL || BP_IS_HOLE(bp)) + return (B_FALSE); uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = os->os_spa->spa_root_vdev; + vdev_t *vd = NULL; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; - if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) { - if (vd == NULL) - return (B_TRUE); + if (vd == NULL) + return (B_TRUE); - if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && - vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || - l2arc_exclude_special == 0) - return (B_TRUE); - } + if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) + return (B_TRUE); } - return (B_FALSE); } @@ -479,7 +495,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int size; - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); @@ -516,8 +532,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (arc_buf_size(os->os_phys_buf) < size) { arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); - bzero(buf->b_data, size); - bcopy(os->os_phys_buf->b_data, buf->b_data, + memset(buf->b_data, 0, size); + memcpy(buf->b_data, os->os_phys_buf->b_data, arc_buf_size(os->os_phys_buf)); arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; @@ -531,7 +547,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); os->os_phys = os->os_phys_buf->b_data; - bzero(os->os_phys, size); + memset(os->os_phys, 0, size); } /* * These properties will be filled in by the logic in zfs_get_zplprop() @@ -559,6 +575,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PREFETCH), + prefetch_changed_cb, os); + } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, @@ -632,6 +653,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; + os->os_prefetch = ZFS_PREFETCH_ALL; } if (ds == NULL || !ds->ds_is_snapshot) @@ -714,7 +736,7 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) * can be held at a time. 
*/ int -dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, +dmu_objset_hold_flags(const char *name, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_pool_t *dp; @@ -742,18 +764,18 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, } int -dmu_objset_hold(const char *name, void *tag, objset_t **osp) +dmu_objset_hold(const char *name, const void *tag, objset_t **osp) { return (dmu_objset_hold_flags(name, B_FALSE, tag, osp)); } static int dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, - boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { - int err; + (void) tag; - err = dmu_objset_from_ds(ds, osp); + int err = dmu_objset_from_ds(ds, osp); if (err != 0) { return (err); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { @@ -789,7 +811,7 @@ dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, */ int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; @@ -834,7 +856,7 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, int dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, - boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; @@ -855,7 +877,7 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, } void -dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) +dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, const void *tag) { ds_hold_flags_t flags; dsl_pool_t *dp = dmu_objset_pool(os); @@ -866,7 +888,7 @@ dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) } void -dmu_objset_rele(objset_t *os, void *tag) +dmu_objset_rele(objset_t *os, const void *tag) { dmu_objset_rele_flags(os, B_FALSE, tag); } @@ -884,7 +906,7 @@ dmu_objset_rele(objset_t *os, void *tag) */ void dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, - boolean_t decrypt, void *tag) + boolean_t decrypt, const void *tag) { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -904,7 +926,7 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, } void -dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) +dmu_objset_disown(objset_t *os, boolean_t decrypt, const void *tag) { ds_hold_flags_t flags; @@ -1118,12 +1140,14 @@ dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, (!os->os_encrypted || !dmu_objset_is_receiving(os))) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; if (dmu_objset_userobjused_enabled(os)) { + ASSERT3P(ds, !=, NULL); ds->ds_feature_activation[ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; } if (dmu_objset_projectquota_enabled(os)) { + ASSERT3P(ds, !=, NULL); ds->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; os->os_phys->os_flags |= @@ -1157,7 +1181,6 @@ typedef struct dmu_objset_create_arg { dsl_crypto_params_t *doca_dcp; } dmu_objset_create_arg_t; -/*ARGSUSED*/ static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { @@ -1299,6 +1322,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(spa, ds->ds_key_mapping, ds); 
dsl_dataset_sync_done(ds, tx); + dmu_buf_rele(ds->ds_dbuf, ds); } mutex_enter(&ds->ds_lock); @@ -1353,7 +1377,6 @@ typedef struct dmu_objset_clone_arg { proc_t *doca_proc; } dmu_objset_clone_arg_t; -/*ARGSUSED*/ static int dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { @@ -1565,10 +1588,10 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) } } -/* ARGSUSED */ static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { + (void) abuf; blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; @@ -1596,10 +1619,10 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); } -/* ARGSUSED */ static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { + (void) abuf; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; @@ -1616,28 +1639,92 @@ dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) kmem_free(bp, sizeof (*bp)); } +typedef struct sync_objset_arg { + zio_t *soa_zio; + objset_t *soa_os; + dmu_tx_t *soa_tx; + kmutex_t soa_mutex; + int soa_count; + taskq_ent_t soa_tq_ent; +} sync_objset_arg_t; + typedef struct sync_dnodes_arg { - multilist_t *sda_list; - int sda_sublist_idx; - multilist_t *sda_newlist; - dmu_tx_t *sda_tx; + multilist_t *sda_list; + int sda_sublist_idx; + multilist_t *sda_newlist; + sync_objset_arg_t *sda_soa; } sync_dnodes_arg_t; +static void sync_meta_dnode_task(void *arg); + static void sync_dnodes_task(void *arg) { sync_dnodes_arg_t *sda = arg; + sync_objset_arg_t *soa = sda->sda_soa; + objset_t *os = soa->soa_os; + uint_t allocator = spa_acq_allocator(os->os_spa); multilist_sublist_t *ms = - multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx); - dmu_objset_sync_dnodes(ms, sda->sda_tx); + dmu_objset_sync_dnodes(ms, soa->soa_tx); multilist_sublist_unlock(ms); + spa_rel_allocator(os->os_spa, allocator); kmem_free(sda, sizeof (*sda)); + + mutex_enter(&soa->soa_mutex); + ASSERT(soa->soa_count != 0); + if (--soa->soa_count != 0) { + mutex_exit(&soa->soa_mutex); + return; + } + mutex_exit(&soa->soa_mutex); + + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); } +/* + * Issue the zio_nowait() for all dirty record zios on the meta dnode, + * then trigger the callback for the zil_sync. This runs once for each + * objset, only after any/all sublists in the objset have been synced. + */ +static void +sync_meta_dnode_task(void *arg) +{ + sync_objset_arg_t *soa = arg; + objset_t *os = soa->soa_os; + dmu_tx_t *tx = soa->soa_tx; + int txgoff = tx->tx_txg & TXG_MASK; + dbuf_dirty_record_t *dr; + + ASSERT0(soa->soa_count); + + list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_remove_head(list)) != NULL) { + ASSERT0(dr->dr_dbuf->db_level); + zio_nowait(dr->dr_zio); + } + + /* Enable dnode backfill if enough objects have been freed. */ + if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { + os->os_rescan_dnodes = B_TRUE; + os->os_freed_dnodes = 0; + } + + /* + * Free intent log blocks up to this tx. 
+ */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(soa->soa_zio); + + mutex_destroy(&soa->soa_mutex); + kmem_free(soa, sizeof (*soa)); +} /* called from dsl */ void @@ -1647,8 +1734,6 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; - list_t *list; - dbuf_dirty_record_t *dr; int num_sublists; multilist_t *ml; blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); @@ -1693,8 +1778,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) } zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os), - &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, + blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os), + &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* @@ -1735,40 +1820,49 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) offsetof(dnode_t, dn_dirty_link[txgoff])); } + /* + * zio_nowait(zio) is done after any/all sublist and meta dnode + * zios have been nowaited, and the zil_sync() has been performed. + * The soa is freed at the end of sync_meta_dnode_task. + */ + sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP); + soa->soa_zio = zio; + soa->soa_os = os; + soa->soa_tx = tx; + taskq_init_ent(&soa->soa_tq_ent); + mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL); + ml = &os->os_dirty_dnodes[txgoff]; - num_sublists = multilist_get_num_sublists(ml); + soa->soa_count = num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { if (multilist_sublist_is_empty_idx(ml, i)) - continue; - sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); - sda->sda_list = ml; - sda->sda_sublist_idx = i; - sda->sda_tx = tx; - (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - sync_dnodes_task, sda, 0); - /* callback frees sda */ + soa->soa_count--; } - taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); - list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { - ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); - zio_nowait(dr->dr_zio); - } - - /* Enable dnode backfill if enough objects have been freed. */ - if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { - os->os_rescan_dnodes = B_TRUE; - os->os_freed_dnodes = 0; + if (soa->soa_count == 0) { + taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq, + sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent); + } else { + /* + * Sync sublists in parallel. The last to finish + * (i.e., when soa->soa_count reaches zero) must + * dispatch sync_meta_dnode_task. + */ + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; + sync_dnodes_arg_t *sda = + kmem_alloc(sizeof (*sda), KM_SLEEP); + sda->sda_list = ml; + sda->sda_sublist_idx = i; + sda->sda_soa = soa; + (void) taskq_dispatch( + dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_task, sda, 0); + /* sync_dnodes_task frees sda */ + } } - - /* - * Free intent log blocks up to this tx. 
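The hunk above replaces the old taskq_wait() barrier in dmu_objset_sync() with a reference-count handoff: each sync_dnodes_task() decrements soa_count under soa_mutex, and whichever worker brings the count to zero dispatches sync_meta_dnode_task(), which then issues the meta-dnode zios, runs zil_sync(), and finally nowaits the objset's own zio. A minimal sketch of that handoff, using the names from the diff (illustrative only, not part of the committed change):

static void
sync_worker_done(sync_objset_arg_t *soa)
{
	boolean_t last;

	mutex_enter(&soa->soa_mutex);
	ASSERT(soa->soa_count != 0);
	last = (--soa->soa_count == 0);
	mutex_exit(&soa->soa_mutex);

	if (last) {
		/* Only the thread that saw the count reach zero gets here. */
		taskq_dispatch_ent(dmu_objset_pool(soa->soa_os)->dp_sync_taskq,
		    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
	}
}

The net effect is that no sync thread blocks waiting for the sublist workers; per-objset ordering is preserved purely by the dispatch chain.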
- */ - zil_sync(os->os_zil, tx); - os->os_phys->os_zil_header = os->os_zil_header; - zio_nowait(zio); } boolean_t @@ -1984,8 +2078,8 @@ userquota_updates_task(void *arg) dnode_t *dn; userquota_cache_t cache = { { 0 } }; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); ASSERT(multilist_sublist_head(list) == NULL || dmu_objset_userused_enabled(os)); @@ -2067,8 +2161,8 @@ dnode_rele_task(void *arg) userquota_updates_arg_t *uua = arg; objset_t *os = uua->uua_os; - multilist_sublist_t *list = - multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx); + multilist_sublist_t *list = multilist_sublist_lock_idx( + &os->os_synced_dnodes, uua->uua_sublist_idx); dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { @@ -2343,7 +2437,7 @@ dmu_objset_space_upgrade(objset_t *os) if (err != 0) return (err); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index 0ec46bdb4f47..0119191d7920 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -27,8 +27,12 @@ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2022 Axcient. 
*/ +#include <sys/arc.h> +#include <sys/spa_impl.h> #include <sys/dmu.h> #include <sys/dmu_impl.h> #include <sys/dmu_send.h> @@ -64,12 +68,19 @@ #endif #include <sys/zfs_file.h> -int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; -int zfs_recv_queue_ff = 20; -int zfs_recv_write_batch_size = 1024 * 1024; +static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE; +static uint_t zfs_recv_queue_ff = 20; +static uint_t zfs_recv_write_batch_size = 1024 * 1024; +static int zfs_recv_best_effort_corrective = 0; -static char *dmu_recv_tag = "dmu_recv_tag"; -const char *recv_clone_name = "%recv"; +static const void *const dmu_recv_tag = "dmu_recv_tag"; +const char *const recv_clone_name = "%recv"; + +typedef enum { + ORNS_NO, + ORNS_YES, + ORNS_MAYBE +} or_need_sync_t; static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); @@ -102,6 +113,8 @@ struct receive_writer_arg { boolean_t done; int err; + const char *tofs; + boolean_t heal; boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ @@ -121,6 +134,10 @@ struct receive_writer_arg { uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; + zio_t *heal_pio; + + /* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */ + or_need_sync_t or_need_sync; }; typedef struct dmu_recv_begin_arg { @@ -343,9 +360,10 @@ static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) { - uint64_t val; + uint64_t obj; uint64_t children; int error; + dsl_dataset_t *snap; dsl_pool_t *dp = ds->ds_dir->dd_pool; boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; @@ -354,7 +372,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, - 8, 1, &val); + 8, 1, &obj); if (error != ENOENT) return (error == 0 ? SET_ERROR(EBUSY) : error); @@ -362,12 +380,16 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (dsl_dataset_has_resume_receive_state(ds)) return (SET_ERROR(EBUSY)); - /* New snapshot name must not exist. */ + /* New snapshot name must not exist if we're not healing it. */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, - drba->drba_cookie->drc_tosnap, 8, 1, &val); - if (error != ENOENT) + drba->drba_cookie->drc_tosnap, 8, 1, &obj); + if (drba->drba_cookie->drc_heal) { + if (error != 0) + return (error); + } else if (error != ENOENT) { return (error == 0 ? SET_ERROR(EEXIST) : error); + } /* Must not have children if receiving a ZVOL. */ error = zap_count(dp->dp_meta_objset, @@ -392,8 +414,40 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (error != 0) return (error); - if (fromguid != 0) { - dsl_dataset_t *snap; + if (drba->drba_cookie->drc_heal) { + /* Encryption is incompatible with embedded data. */ + if (encrypted && embed) + return (SET_ERROR(EINVAL)); + + /* Healing is not supported when in 'force' mode. */ + if (drba->drba_cookie->drc_force) + return (SET_ERROR(EINVAL)); + + /* Must have keys loaded if doing encrypted non-raw recv. 
*/ + if (encrypted && !raw) { + if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object, + NULL, NULL) != 0) + return (SET_ERROR(EACCES)); + } + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); + if (error != 0) + return (error); + + /* + * When not doing best effort corrective recv healing can only + * be done if the send stream is for the same snapshot as the + * one we are trying to heal. + */ + if (zfs_recv_best_effort_corrective == 0 && + drba->drba_cookie->drc_drrb->drr_toguid != + dsl_dataset_phys(snap)->ds_guid) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + dsl_dataset_rele(snap, FTAG); + } else if (fromguid != 0) { + /* Sanity check the incremental recv */ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Can't perform a raw receive on top of a non-raw receive */ @@ -459,7 +513,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_rele(snap, FTAG); } else { - /* if full, then must be forced */ + /* If full and not healing then must be forced. */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); @@ -597,7 +651,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) if (!(flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); } else { - dsflags |= DS_HOLD_FLAG_DECRYPT; + /* + * We support unencrypted datasets below encrypted ones now, + * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing + * with a dataset we may encrypt. + */ + if (drba->drba_dcp == NULL || + drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } } error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); @@ -618,6 +680,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) char buf[ZFS_MAX_DATASET_NAME_LEN]; objset_t *os; + /* healing recv must be done "into" an existing snapshot */ + if (drba->drba_cookie->drc_heal == B_TRUE) + return (SET_ERROR(ENOTSUP)); + /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. 
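For readers following the drc_heal branch added to recv_begin_check_existing_impl() above, the new checks amount to four preconditions before the snapshot lookup proceeds (an editorial summary of this hunk, not committed code):

/*
 * A healing receive into an existing snapshot requires that:
 *   - the stream does not combine encryption with embedded data (EINVAL);
 *   - the receive was not requested with 'force' (EINVAL);
 *   - for an encrypted, non-raw stream, the dataset's key is loaded in
 *     the spa keystore (EACCES otherwise);
 *   - unless zfs_recv_best_effort_corrective is set, the stream's
 *     drr_toguid matches the guid of the snapshot being healed (ENOTSUP).
 */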
@@ -799,7 +865,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { - /* create temporary clone */ + /* Create temporary clone unless we're doing corrective recv */ dsl_dataset_t *snap = NULL; if (drba->drba_cookie->drc_fromsnapobj != 0) { @@ -807,8 +873,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } - dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, - snap, crflags, drba->drba_cred, dcp, tx); + if (drc->drc_heal) { + /* When healing we want to use the provided snapshot */ + VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap, + &dsobj)); + } else { + dsobj = dsl_dataset_create_sync(ds->ds_dir, + recv_clone_name, snap, crflags, drba->drba_cred, + dcp, tx); + } if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); @@ -925,7 +998,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && - (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) { + (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && + !drc->drc_heal) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } @@ -981,13 +1055,24 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsflags |= DS_HOLD_FLAG_DECRYPT; } + boolean_t recvexist = B_TRUE; if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ + recvexist = B_FALSE; error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error != 0) return (error); } + /* + * Resume of full/newfs recv on existing dataset should be done with + * force flag + */ + if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(ZFS_ERR_RESUME_EXISTS)); + } + /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); @@ -1132,20 +1217,22 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) * succeeds; otherwise we will leak the holds on the datasets. 
*/ int -dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, nvlist_t *localprops, - nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, - zfs_file_t *fp, offset_t *voffp) +dmu_recv_begin(const char *tofs, const char *tosnap, + dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal, + boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args, + const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp, + offset_t *voffp) { dmu_recv_begin_arg_t drba = { 0 }; - int err; + int err = 0; - bzero(drc, sizeof (dmu_recv_cookie_t)); + memset(drc, 0, sizeof (dmu_recv_cookie_t)); drc->drc_drr_begin = drr_begin; drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_heal = heal; drc->drc_resumable = resumable; drc->drc_cred = CRED(); drc->drc_proc = curproc; @@ -1169,20 +1256,36 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; - void *payload = NULL; - if (payloadlen != 0) - payload = kmem_alloc(payloadlen, KM_SLEEP); - err = receive_read_payload_and_next_header(drc, payloadlen, - payload); - if (err != 0) { - kmem_free(payload, payloadlen); - return (err); - } + /* + * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace + * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard + * upper limit. Systems with less than 1GB of RAM will see a lower + * limit from `arc_all_memory() / 4`. + */ + if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4))) + return (E2BIG); + + if (payloadlen != 0) { + void *payload = vmem_alloc(payloadlen, KM_SLEEP); + /* + * For compatibility with recursive send streams, we don't do + * this here if the stream could be part of a package. Instead, + * we'll do it in dmu_recv_stream. If we pull the next header + * too early, and it's the END record, we break the `recv_skip` + * logic. + */ + + err = receive_read_payload_and_next_header(drc, payloadlen, + payload); + if (err != 0) { + vmem_free(payload, payloadlen); + return (err); + } err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl, KM_SLEEP); - kmem_free(payload, payloadlen); + vmem_free(payload, payloadlen); if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); @@ -1203,7 +1306,6 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL); } else { - /* * For non-raw, non-incremental, non-resuming receives the * user can specify encryption parameters on the command line @@ -1236,6 +1338,186 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, return (err); } +/* + * Holds data need for corrective recv callback + */ +typedef struct cr_cb_data { + uint64_t size; + zbookmark_phys_t zb; + spa_t *spa; +} cr_cb_data_t; + +static void +corrective_read_done(zio_t *zio) +{ + cr_cb_data_t *data = zio->io_private; + /* Corruption corrected; update error log if needed */ + if (zio->io_error == 0) { + spa_remove_error(data->spa, &data->zb, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); + } + kmem_free(data, sizeof (cr_cb_data_t)); + abd_free(zio->io_abd); +} + +/* + * zio_rewrite the data pointed to by bp with the data from the rrd's abd. 
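The begin-payload guard added to dmu_recv_begin() above caps the nvlist that can accompany the BEGIN record. Restated as a free-standing check (the helper name is invented for illustration; the arithmetic mirrors the diff): the payload is rejected with E2BIG once it exceeds the smaller of 256MB (1U << 28) and one quarter of arc_all_memory(), and the buffer itself now comes from vmem_alloc() to suit allocations of that size.

static int
recv_begin_payload_cap(uint32_t payloadlen)
{
	/* 256MB hard cap; systems with < 1GB of RAM see arc_all_memory()/4 */
	uint64_t cap = MIN(1ULL << 28, arc_all_memory() / 4);

	return (payloadlen > cap ? E2BIG : 0);
}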
+ */ +static int +do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, + struct receive_record_arg *rrd, blkptr_t *bp) +{ + int err; + zio_t *io; + zbookmark_phys_t zb; + dnode_t *dn; + abd_t *abd = rrd->abd; + zio_cksum_t bp_cksum = bp->blk_cksum; + zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL; + + if (rwa->raw) + flags |= ZIO_FLAG_RAW; + + err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn); + if (err != 0) + return (err); + SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0, + dbuf_whichblock(dn, 0, drrw->drr_offset)); + dnode_rele(dn, FTAG); + + if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) { + /* Decompress the stream data */ + abd_t *dabd = abd_alloc_linear( + drrw->drr_logical_size, B_FALSE); + err = zio_decompress_data(drrw->drr_compressiontype, + abd, abd_to_buf(dabd), abd_get_size(abd), + abd_get_size(dabd), NULL); + + if (err != 0) { + abd_free(dabd); + return (err); + } + /* Swap in the newly decompressed data into the abd */ + abd_free(abd); + abd = dabd; + } + + if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + /* Recompress the data */ + abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), + B_FALSE); + void *buf = abd_to_buf(cabd); + uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), + abd, &buf, abd_get_size(abd), + rwa->os->os_complevel); + abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); + /* Swap in newly compressed data into the abd */ + abd_free(abd); + abd = cabd; + flags |= ZIO_FLAG_RAW_COMPRESS; + } + + /* + * The stream is not encrypted but the data on-disk is. + * We need to re-encrypt the buf using the same + * encryption type, salt, iv, and mac that was used to encrypt + * the block previosly. + */ + if (!rwa->raw && BP_USES_CRYPT(bp)) { + dsl_dataset_t *ds; + dsl_crypto_key_t *dck = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + dsl_pool_t *dp = dmu_objset_pool(rwa->os); + abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); + + zio_crypt_decode_params_bp(bp, salt, iv); + zio_crypt_decode_mac_bp(bp, mac); + + dsl_pool_config_enter(dp, FTAG); + err = dsl_dataset_hold_flags(dp, rwa->tofs, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + abd_free(eabd); + return (SET_ERROR(EACCES)); + } + + /* Look up the key from the spa's keystore */ + err = spa_keystore_lookup_key(rwa->os->os_spa, + zb.zb_objset, FTAG, &dck); + if (err != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); + dsl_pool_config_exit(dp, FTAG); + abd_free(eabd); + return (SET_ERROR(EACCES)); + } + + err = zio_do_crypt_abd(B_TRUE, &dck->dck_key, + BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, + mac, abd_get_size(abd), abd, eabd, &no_crypt); + + spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + dsl_pool_config_exit(dp, FTAG); + + ASSERT0(no_crypt); + if (err != 0) { + abd_free(eabd); + return (err); + } + /* Swap in the newly encrypted data into the abd */ + abd_free(abd); + abd = eabd; + + /* + * We want to prevent zio_rewrite() from trying to + * encrypt the data again + */ + flags |= ZIO_FLAG_RAW_ENCRYPT; + } + rrd->abd = abd; + + io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp, + abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, + &zb); + + ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) || + abd_get_size(abd) == BP_GET_PSIZE(bp)); + + /* compute new bp 
checksum value and make sure it matches the old one */ + zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd)); + if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) { + zio_destroy(io); + if (zfs_recv_best_effort_corrective != 0) + return (0); + return (SET_ERROR(ECKSUM)); + } + + /* Correct the corruption in place */ + err = zio_wait(io); + if (err == 0) { + cr_cb_data_t *cb_data = + kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP); + cb_data->spa = rwa->os->os_spa; + cb_data->size = drrw->drr_logical_size; + cb_data->zb = zb; + /* Test if healing worked by re-reading the bp */ + err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp, + abd_alloc_for_io(drrw->drr_logical_size, B_FALSE), + drrw->drr_logical_size, corrective_read_done, + cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL)); + } + if (err != 0 && zfs_recv_best_effort_corrective != 0) + err = 0; + + return (err); +} + static int receive_read(dmu_recv_cookie_t *drc, int len, void *buf) { @@ -1249,11 +1531,11 @@ receive_read(dmu_recv_cookie_t *drc, int len, void *buf) (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); while (done < len) { - ssize_t resid; + ssize_t resid = len - done; zfs_file_t *fp = drc->drc_fp; int err = zfs_file_read(fp, (char *)buf + done, len - done, &resid); - if (resid == len - done) { + if (err == 0 && resid == len - done) { /* * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates * that the receive was interrupted and can @@ -1516,17 +1798,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa, } /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. + * The dmu does not currently support decreasing nlevels or changing + * indirect block size if there is already one, same as changing the + * number of of dnode slots on an object. For non-raw sends this + * does not matter and the new object can just use the previous one's + * parameters. For raw sends, however, the structure of the received + * dnode (including indirects and dnode slots) must match that of the + * send side. Therefore, instead of using dmu_object_reclaim(), we + * must free the object completely and call dmu_object_claim_dnsize() + * instead. 
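do_corrective_recv() above is easier to follow as a sequence of normalization steps, collected here as an outline (an editorial summary of the hunk, not code from the commit):

/*
 * 1. Normalize the stream payload to the on-disk form of the damaged
 *    block: decompress it if the stream was compressed but the block is
 *    not, recompress it with the block's own algorithm if it is, and
 *    re-encrypt it with the salt/IV/MAC decoded from the bp when the
 *    stream is plaintext but the block is encrypted.
 * 2. Build a zio_rewrite() of the same bp and compute the checksum of
 *    the normalized payload; if it does not equal bp->blk_cksum the
 *    rewrite is destroyed and ECKSUM is returned (or 0 when
 *    zfs_recv_best_effort_corrective is set), so data that does not
 *    reproduce the original bytes can never overwrite the block.
 * 3. zio_wait() the rewrite, then re-read the bp under the writer's
 *    heal_pio; on success corrective_read_done() removes the entry from
 *    the spa error log.
 */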
*/ - if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + if ((rwa->raw && ((doi->doi_indirection > 1 && + indblksz != doi->doi_metadata_block_size) || + drro->drr_nlevels < doi->doi_indirection)) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) @@ -1634,6 +1918,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, if (err == 0) { err = receive_handle_existing_object(rwa, drro, &doi, data, &object_to_hold, &new_blksz); + if (err != 0) + return (err); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a @@ -1650,10 +1936,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, /* object was freed and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } else { + /* + * If the only record in this range so far was DRR_FREEOBJECTS + * with at least one actually freed object, it's possible that + * the block will now be converted to a hole. We need to wait + * for the txg to sync to prevent races. + */ + if (rwa->or_need_sync == ORNS_YES) + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + /* object is free and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } + /* Only relevant for the first object in the range */ + rwa->or_need_sync = ORNS_NO; + /* * If this is a multi-slot dnode there is a chance that this * object will expand into a slot that is already used by @@ -1800,7 +2098,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); + memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro)); /* * Raw bonus buffers have their byteorder determined by the @@ -1815,12 +2113,21 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } + + /* + * If the receive fails, we want the resume stream to start with the + * same record that we last successfully received. There is no way to + * request resume from the object record, but we can benefit from the + * fact that sender always sends object record before anything else, + * after which it will "resend" data at offset 0 and resume normally. 
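Together with the receive_freeobjects() and receive_object_range() hunks later in this file, the new or_need_sync field forms a small per-object-range state machine; its transitions, collected in one place (an editorial summary, not committed code):

/*
 * receive_object_range():  or_need_sync = ORNS_MAYBE;
 * receive_freeobjects():   if (or_need_sync == ORNS_MAYBE)
 *                              or_need_sync = ORNS_YES;   (an object was freed)
 * receive_object(), free-slot case:
 *                          if (or_need_sync == ORNS_YES)
 *                              txg_wait_synced(pool, 0);  (dnode block may be
 *                                                          turning into a hole)
 *                          or_need_sync = ORNS_NO;  (only the first object in
 *                                                    the range matters)
 */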
+ */ + save_resume_state(rwa, drro->drr_object, 0, tx); + dmu_tx_commit(tx); return (0); } -/* ARGSUSED */ noinline static int receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) @@ -1848,6 +2155,9 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); + + if (rwa->or_need_sync == ORNS_MAYBE) + rwa->or_need_sync = ORNS_YES; } if (next_err != ESRCH) return (next_err); @@ -1931,10 +2241,10 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) if (err == 0) abd_free(abd); } else { - zio_prop_t zp; + zio_prop_t zp = {0}; dmu_write_policy(rwa->os, dn, 0, 0, &zp); - enum zio_flag zio_flags = 0; + zio_flag_t zio_flags = 0; if (rwa->raw) { zp.zp_encrypt = B_TRUE; @@ -1942,11 +2252,11 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ rwa->byteswap; - bcopy(drrw->drr_salt, zp.zp_salt, + memcpy(zp.zp_salt, drrw->drr_salt, ZIO_DATA_SALT_LEN); - bcopy(drrw->drr_iv, zp.zp_iv, + memcpy(zp.zp_iv, drrw->drr_iv, ZIO_DATA_IV_LEN); - bcopy(drrw->drr_mac, zp.zp_mac, + memcpy(zp.zp_mac, drrw->drr_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { zp.zp_nopwrite = B_FALSE; @@ -2043,6 +2353,53 @@ receive_process_write_record(struct receive_writer_arg *rwa, !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); + if (rwa->heal) { + blkptr_t *bp; + dmu_buf_t *dbp; + int flags = DB_RF_CANFAIL; + + if (rwa->raw) + flags |= DB_RF_NO_DECRYPT; + + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd), + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + err = dmu_buf_hold_noread(rwa->os, drrw->drr_object, + drrw->drr_offset, FTAG, &dbp); + if (err != 0) + return (err); + + /* Try to read the object to see if it needs healing */ + err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags); + /* + * We only try to heal when dbuf_read() returns a ECKSUMs. + * Other errors (even EIO) get returned to caller. + * EIO indicates that the device is not present/accessible, + * so writing to it will likely fail. + * If the block is healthy, we don't want to overwrite it + * unnecessarily. + */ + if (err != ECKSUM) { + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* Make sure the on-disk block and recv record sizes match */ + if (drrw->drr_logical_size != dbp->db_size) { + err = ENOTSUP; + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* Get the block pointer for the corrupted block */ + bp = dmu_buf_get_blkptr(dbp); + err = do_corrective_recv(rwa, drrw, rrd, bp); + dmu_buf_rele(dbp, FTAG); + return (err); + } + /* * For resuming to work, records must be in increasing order * by (object, offset). @@ -2183,7 +2540,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * size of the provided arc_buf_t. 
*/ if (db_spill->db_size != drrs->drr_length) { - dmu_buf_will_fill(db_spill, tx); + dmu_buf_will_fill(db_spill, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } @@ -2211,7 +2568,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, } } - bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs)); + memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); @@ -2222,7 +2579,6 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, return (0); } -/* ARGSUSED */ noinline static int receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { @@ -2285,11 +2641,13 @@ receive_object_range(struct receive_writer_arg *rwa, rwa->or_crypt_params_present = B_TRUE; rwa->or_firstobj = drror->drr_firstobj; rwa->or_numslots = drror->drr_numslots; - bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN); - bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN); - bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN); + memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN); + memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN); + memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN); rwa->or_byteorder = byteorder; + rwa->or_need_sync = ORNS_MAYBE; + return (0); } @@ -2297,7 +2655,6 @@ receive_object_range(struct receive_writer_arg *rwa, * Until we have the ability to redact large ranges of data efficiently, we * process these records as frees. */ -/* ARGSUSED */ noinline static int receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr) { @@ -2337,7 +2694,8 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (!drc->drc_heal) + (void) dsl_destroy_head(name); } } @@ -2446,7 +2804,6 @@ receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) * numbers in the ignore list. In practice, we receive up to 32 object records * before receiving write records, so the list can have up to 32 nodes in it. */ -/* ARGSUSED */ static void receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset, uint64_t length) @@ -2699,7 +3056,19 @@ receive_process_record(struct receive_writer_arg *rwa, ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; - if (rrd->header.drr_type != DRR_WRITE) { + /* We can only heal write records; other ones get ignored */ + if (rwa->heal && rrd->header.drr_type != DRR_WRITE) { + if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + return (0); + } + + if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { if (rrd->abd != NULL) { @@ -2734,9 +3103,16 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_WRITE: { err = receive_process_write_record(rwa, rrd); - if (err != EAGAIN) { + if (rwa->heal) { + /* + * If healing - always free the abd after processing + */ + abd_free(rrd->abd); + rrd->abd = NULL; + } else if (err != EAGAIN) { /* - * On success, receive_process_write_record() returns + * On success, a non-healing + * receive_process_write_record() returns * EAGAIN to indicate that we do not want to free * the rrd or arc_buf. 
*/ @@ -2798,7 +3174,7 @@ receive_process_record(struct receive_writer_arg *rwa, * dmu_recv_stream's worker thread; pull records off the queue, and then call * receive_process_record When we're done, signal the main thread and exit. */ -static void +static __attribute__((noreturn)) void receive_writer_thread(void *arg) { struct receive_writer_arg *rwa = arg; @@ -2827,8 +3203,9 @@ receive_writer_thread(void *arg) * EAGAIN indicates that this record has been saved (on * raw->write_batch), and will be used again, so we don't * free it. + * When healing data we always need to free the record. */ - if (err != EAGAIN) { + if (err != EAGAIN || rwa->heal) { if (rwa->err == 0) rwa->err = err; kmem_free(rrd, sizeof (*rrd)); @@ -2836,10 +3213,13 @@ receive_writer_thread(void *arg) } kmem_free(rrd, sizeof (*rrd)); - int err = flush_write_batch(rwa); - if (rwa->err == 0) - rwa->err = err; - + if (rwa->heal) { + zio_wait(rwa->heal_pio); + } else { + int err = flush_write_batch(rwa); + if (rwa->err == 0) + rwa->err = err; + } mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); @@ -2923,17 +3303,19 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) if (err != 0) goto out; - /* - * If this is a new dataset we set the key immediately. - * Otherwise we don't want to change the key until we - * are sure the rest of the receive succeeded so we stash - * the keynvl away until then. - */ - err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), - drc->drc_ds->ds_object, drc->drc_fromsnapobj, - drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); - if (err != 0) - goto out; + if (!drc->drc_heal) { + /* + * If this is a new dataset we set the key immediately. + * Otherwise we don't want to change the key until we + * are sure the rest of the receive succeeded so we + * stash the keynvl away until then. + */ + err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), + drc->drc_ds->ds_object, drc->drc_fromsnapobj, + drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); + if (err != 0) + goto out; + } /* see comment in dmu_recv_end_sync() */ drc->drc_ivset_guid = 0; @@ -2951,6 +3333,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) } /* + * For compatibility with recursive send streams, we do this here, + * rather than in dmu_recv_begin. If we pull the next header too + * early, and it's the END record, we break the `recv_skip` logic. + */ + if (drc->drc_drr_begin->drr_payloadlen == 0) { + err = receive_read_payload_and_next_header(drc, 0, NULL); + if (err != 0) + goto out; + } + + /* * If we failed before this point we will clean up any new resume * state that was created. Now that we've gotten past the initial * checks we are ok to retain that resume state. @@ -2964,11 +3357,17 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; + rwa->heal = drc->drc_heal; + rwa->tofs = drc->drc_tofs; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; + if (drc->drc_heal) { + rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL, + ZIO_FLAG_GODFATHER); + } list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); @@ -2990,7 +3389,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) * stream, then we free drc->drc_rrd and exit. 
*/ while (rwa->err == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = SET_ERROR(EINTR); break; } @@ -3104,7 +3503,9 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); - if (!drc->drc_newfs) { + if (drc->drc_heal) { + error = 0; + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); @@ -3180,13 +3581,18 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; - uint64_t newsnapobj; + uint64_t newsnapobj = 0; spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; - if (!drc->drc_newfs) { + if (drc->drc_heal) { + if (drc->drc_keynvl != NULL) { + nvlist_free(drc->drc_keynvl); + drc->drc_keynvl = NULL; + } + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, @@ -3300,7 +3706,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) * tunable is set, in which case we will leave the newly-generated * value. */ - if (drc->drc_raw && drc->drc_ivset_guid != 0) { + if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) { dmu_object_zapify(dp->dp_meta_objset, newsnapobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj, @@ -3367,7 +3773,7 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) if (error != 0) { dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); - } else { + } else if (!drc->drc_heal) { if (drc->drc_newfs) { zvol_create_minor(drc->drc_tofs); } @@ -3389,13 +3795,15 @@ dmu_objset_is_receiving(objset_t *os) os->os_dsl_dataset->ds_owner == dmu_recv_tag); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW, "Maximum receive queue length"); -ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW, "Receive queue fill fraction"); -ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW, "Maximum amount of writes to batch into one transaction"); + +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW, + "Ignore errors during corrective receive"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c index fdbdf7d6e868..1feba0ba83de 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_redact.c +++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -40,13 +40,14 @@ * This controls the number of entries in the buffer the redaction_list_update * synctask uses to buffer writes to the redaction list. */ -int redact_sync_bufsize = 1024; +static const int redact_sync_bufsize = 1024; /* * Controls how often to update the redaction list when creating a redaction * list. 
*/ -uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */ +static const uint64_t redaction_list_update_interval_ns = + 1000 * 1000 * 1000ULL; /* 1s */ /* * This tunable controls the length of the queues that zfs redact worker threads @@ -56,7 +57,7 @@ uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */ * available IO resources, or the queues are consuming too much memory, this * variable may need to be decreased. */ -int zfs_redact_queue_length = 1024 * 1024; +static const int zfs_redact_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs redact. The @@ -65,7 +66,7 @@ int zfs_redact_queue_length = 1024 * 1024; * should be tuned down. If the queues empty before the signalled thread can * catch up, then these should be tuned up. */ -uint64_t zfs_redact_queue_ff = 20; +static const uint64_t zfs_redact_queue_ff = 20; struct redact_record { bqueue_node_t ln; @@ -141,7 +142,7 @@ record_merge_enqueue(bqueue_t *q, struct redact_record **build, { if (new->eos_marker) { if (*build != NULL) - bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } @@ -249,11 +250,11 @@ zfs_get_deleteq(objset_t *os) * Third, if there is a deleted object, we need to create a redaction record for * all of the blocks in that object. */ -/*ARGSUSED*/ static int redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { + (void) spa, (void) zilog; struct redact_thread_arg *rta = arg; struct redact_record *record; @@ -350,7 +351,7 @@ redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (0); } -static void +static __attribute__((noreturn)) void redact_traverse_thread(void *arg) { struct redact_thread_arg *rt_arg = arg; @@ -745,10 +746,8 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, bqueue_enqueue(q, record, sizeof (*record)); return (0); } - if (num_threads > 0) { - redact_nodes = kmem_zalloc(num_threads * - sizeof (*redact_nodes), KM_SLEEP); - } + redact_nodes = vmem_zalloc(num_threads * + sizeof (*redact_nodes), KM_SLEEP); avl_create(&start_tree, redact_node_compare_start, sizeof (struct redact_node), @@ -821,9 +820,9 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&start_tree); avl_destroy(&end_tree); - kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); + vmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) - bqueue_enqueue(q, current_record, sizeof (current_record)); + bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } @@ -836,7 +835,7 @@ struct redact_merge_thread_arg { int error_code; }; -static void +static __attribute__((noreturn)) void redact_merge_thread(void *arg) { struct redact_merge_thread_arg *rmta = arg; @@ -854,7 +853,7 @@ redact_merge_thread(void *arg) * object number. 
*/ static int -hold_next_object(objset_t *os, struct redact_record *rec, void *tag, +hold_next_object(objset_t *os, struct redact_record *rec, const void *tag, uint64_t *object, dnode_t **dn) { int err = 0; @@ -913,7 +912,7 @@ perform_redaction(objset_t *os, redaction_list_t *rl, object = prev_obj; } while (err == 0 && object <= rec->end_object) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { err = EINTR; break; } @@ -1031,7 +1030,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, numsnaps = fnvlist_num_pairs(redactnvl); if (numsnaps > 0) - args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + args = vmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); nvpair_t *pair = NULL; for (int i = 0; i < numsnaps; i++) { @@ -1080,7 +1079,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); @@ -1120,7 +1119,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, } else { uint64_t *guids = NULL; if (numsnaps > 0) { - guids = kmem_zalloc(numsnaps * sizeof (uint64_t), + guids = vmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); } for (int i = 0; i < numsnaps; i++) { @@ -1132,10 +1131,9 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, dp = NULL; err = dsl_bookmark_create_redacted(newredactbook, snapname, numsnaps, guids, FTAG, &new_rl); - kmem_free(guids, numsnaps * sizeof (uint64_t)); - if (err != 0) { + vmem_free(guids, numsnaps * sizeof (uint64_t)); + if (err != 0) goto out; - } } for (int i = 0; i < numsnaps; i++) { @@ -1189,7 +1187,7 @@ out: } if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 0658e13c2d25..cb2b62fed313 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -67,7 +67,7 @@ #endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ -int zfs_send_corrupt_data = B_FALSE; +static int zfs_send_corrupt_data = B_FALSE; /* * This tunable controls the amount of data (measured in bytes) that will be * prefetched by zfs send. If the main thread is blocking on reads that haven't @@ -75,7 +75,7 @@ int zfs_send_corrupt_data = B_FALSE; * thread is issuing new reads because the prefetches have fallen out of the * cache, this may need to be decreased. */ -int zfs_send_queue_length = SPA_MAXBLOCKSIZE; +static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE; /* * This tunable controls the length of the queues that zfs send worker threads * use to communicate. If the send_main_thread is blocking on these queues, @@ -83,7 +83,7 @@ int zfs_send_queue_length = SPA_MAXBLOCKSIZE; * at the start of a send as these threads consume all the available IO * resources, this variable may need to be decreased. 
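The send-side tunables converted above only need file scope because ZFS_MODULE_PARAM(), at the end of this file, is what exposes them as module parameters; the int-to-uint_t change matches the UINT parameter class used there. A representative pairing, with both declarations taken from this diff:

static uint_t zfs_send_queue_length = SPA_MAXBLOCKSIZE;

ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, UINT, ZMOD_RW,
	"Maximum send queue length");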
*/ -int zfs_send_no_prefetch_queue_length = 1024 * 1024; +static uint_t zfs_send_no_prefetch_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs send. The fill * fraction controls the frequency with which threads have to be cv_signaled. @@ -91,19 +91,19 @@ int zfs_send_no_prefetch_queue_length = 1024 * 1024; * down. If the queues empty before the signalled thread can catch up, then * these should be tuned up. */ -int zfs_send_queue_ff = 20; -int zfs_send_no_prefetch_queue_ff = 20; +static uint_t zfs_send_queue_ff = 20; +static uint_t zfs_send_no_prefetch_queue_ff = 20; /* * Use this to override the recordsize calculation for fast zfs send estimates. */ -int zfs_override_estimate_recordsize = 0; +static uint_t zfs_override_estimate_recordsize = 0; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ -int zfs_send_set_freerecords_bit = B_TRUE; +static const boolean_t zfs_send_set_freerecords_bit = B_TRUE; /* Set this tunable to FALSE is disable sending unmodified spill blocks. */ -int zfs_send_unmodified_spill_blocks = B_TRUE; +static int zfs_send_unmodified_spill_blocks = B_TRUE; static inline boolean_t overflow_multiply(uint64_t a, uint64_t b, uint64_t *c) @@ -165,6 +165,7 @@ struct send_range { kmutex_t lock; kcondvar_t cv; boolean_t io_outstanding; + boolean_t io_compressed; int io_err; } data; struct srh { @@ -378,7 +379,7 @@ dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, } } /* create a FREE record and make it pending */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; @@ -437,7 +438,7 @@ dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, } } /* create a REDACT record and make it pending */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_REDACT; drrr->drr_object = object; drrr->drr_offset = offset; @@ -450,7 +451,8 @@ dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, static int dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, - uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) + uint64_t offset, int lsize, int psize, const blkptr_t *bp, + boolean_t io_compressed, void *data) { uint64_t payload_size; boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); @@ -478,7 +480,7 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, dscp->dsc_pending_op = PENDING_NONE; } /* write a WRITE record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; @@ -487,7 +489,11 @@ dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed or raw */ - if (raw || lsize != psize) { + boolean_t compressed = + (bp != NULL ? 
BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + io_compressed : lsize != psize); + if (raw || compressed) { + ASSERT(bp != NULL); ASSERT(raw || dscp->dsc_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); @@ -566,7 +572,7 @@ dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, ASSERT(BP_IS_EMBEDDED(bp)); - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; @@ -579,7 +585,13 @@ dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, decode_embedded_bp_compressed(bp, buf); - if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + uint32_t psize = drrw->drr_psize; + uint32_t rsize = P2ROUNDUP(psize, 8); + + if (psize != rsize) + memset(buf + psize, 0, rsize - psize); + + if (dump_record(dscp, buf, rsize) != 0) return (SET_ERROR(EINTR)); return (0); } @@ -599,7 +611,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, } /* write a SPILL record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; @@ -607,7 +619,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, /* See comment in dump_dnode() for full details */ if (zfs_send_unmodified_spill_blocks && - (bp->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; } @@ -681,7 +693,7 @@ dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs) } /* write a FREEOBJECTS record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; @@ -722,7 +734,7 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, } /* write an OBJECT record */ - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; @@ -758,6 +770,8 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, * to send it. 
*/ if (bonuslen != 0) { + if (drro->drr_bonuslen > DN_MAX_BONUS_LEN(dnp)) + return (SET_ERROR(EINVAL)); drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp); bonuslen = drro->drr_raw_bonuslen; } @@ -790,11 +804,11 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, */ if (zfs_send_unmodified_spill_blocks && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { struct send_range record; blkptr_t *bp = DN_SPILL_BLKPTR(dnp); - bzero(&record, sizeof (struct send_range)); + memset(&record, 0, sizeof (struct send_range)); record.type = DATA; record.object = object; record.eos_marker = B_FALSE; @@ -834,7 +848,7 @@ dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp, dscp->dsc_pending_op = PENDING_NONE; } - bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + memset(dscp->dsc_drr, 0, sizeof (dmu_replay_record_t)); dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE; drror->drr_firstobj = firstobj; drror->drr_numslots = numslots; @@ -927,7 +941,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); if (BP_GET_TYPE(bp) == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + zio_flag_t zioflags = ZIO_FLAG_CANFAIL; if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); @@ -1014,7 +1028,8 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) int n = MIN(srdp->datablksz, SPA_OLD_MAXBLOCKSIZE); err = dmu_dump_write(dscp, srdp->obj_type, - range->object, offset, n, n, NULL, data); + range->object, offset, n, n, NULL, B_FALSE, + data); offset += n; /* * When doing dry run, data==NULL is used as a @@ -1028,7 +1043,8 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) } else { err = dmu_dump_write(dscp, srdp->obj_type, range->object, offset, - srdp->datablksz, srdp->datasz, bp, data); + srdp->datablksz, srdp->datasz, bp, + srdp->io_compressed, data); } return (err); } @@ -1081,6 +1097,7 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL); range->sru.data.io_outstanding = 0; range->sru.data.io_err = 0; + range->sru.data.io_compressed = B_FALSE; } return (range); } @@ -1089,11 +1106,11 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, * This is the callback function to traverse_dataset that acts as a worker * thread for dmu_send_impl. 
*/ -/*ARGSUSED*/ static int send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { + (void) zilog; struct send_thread_arg *sta = arg; struct send_range *record; @@ -1106,9 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { - spa_log_error(spa, zb); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(sta->os)); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EIO)); } @@ -1126,7 +1141,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, record->sru.object.bp = *bp; size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1); record->sru.object.dnp = kmem_alloc(size, KM_SLEEP); - bcopy(dnp, record->sru.object.dnp, size); + memcpy(record->sru.object.dnp, dnp, size); bqueue_enqueue(&sta->q, record, sizeof (*record)); return (0); } @@ -1224,7 +1239,7 @@ redact_list_cb(redact_block_phys_t *rb, void *arg) * error code of the thread in case something goes wrong, and pushes the End of * Stream record when the traverse_dataset call has finished. */ -static void +static __attribute__((noreturn)) void send_traverse_thread(void *arg) { struct send_thread_arg *st_arg = arg; @@ -1314,7 +1329,7 @@ get_next_range(bqueue_t *bq, struct send_range *prev) return (next); } -static void +static __attribute__((noreturn)) void redact_list_thread(void *arg) { struct redact_list_thread_arg *rlt_arg = arg; @@ -1509,7 +1524,7 @@ find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask) * data from the redact_list_thread and use that to determine which blocks * should be redacted. */ -static void +static __attribute__((noreturn)) void send_merge_thread(void *arg) { struct send_merge_thread_arg *smt_arg = arg; @@ -1576,8 +1591,6 @@ send_merge_thread(void *arg) } range_free(front_ranges[i]); } - if (range == NULL) - range = kmem_zalloc(sizeof (*range), KM_SLEEP); range->eos_marker = B_TRUE; bqueue_enqueue_flush(&smt_arg->q, range, 1); spl_fstrans_unmark(cookie); @@ -1644,12 +1657,15 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + zio_flag_t zioflags = ZIO_FLAG_CANFAIL; - if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) + if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) { zioflags |= ZIO_FLAG_RAW; - else if (request_compressed) + srdp->io_compressed = B_TRUE; + } else if (request_compressed) { zioflags |= ZIO_FLAG_RAW_COMPRESS; + srdp->io_compressed = B_TRUE; + } srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp); @@ -1701,8 +1717,10 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, struct send_range *range = range_alloc(range_type, dn->dn_object, blkid, blkid + count, B_FALSE); - if (blkid == DMU_SPILL_BLKID) + if (blkid == DMU_SPILL_BLKID) { + ASSERT3P(bp, !=, NULL); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); + } switch (range_type) { case HOLE: @@ -1731,7 +1749,7 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, * some indirect blocks can be discarded because they're not holes. Second, * it issues prefetches for the data we need to send. 
*/ -static void +static __attribute__((noreturn)) void send_reader_thread(void *arg) { struct send_reader_thread_arg *srta = arg; @@ -1823,8 +1841,7 @@ send_reader_thread(void *arg) continue; } uint64_t file_max = - (dn->dn_maxblkid < range->end_blkid ? - dn->dn_maxblkid : range->end_blkid); + MIN(dn->dn_maxblkid, range->end_blkid); /* * The object exists, so we need to try to find the * blkptr for each block in the range we're processing. @@ -1900,7 +1917,7 @@ send_reader_thread(void *arg) struct dmu_send_params { /* Pool args */ - void *tag; // Tag that dp was held with, will be used to release dp. + const void *tag; // Tag dp was held with, will be used to release dp. dsl_pool_t *dp; /* To snapshot args */ const char *tosnap; @@ -1936,7 +1953,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, { dsl_dataset_t *to_ds = dspp->to_ds; dsl_pool_t *dp = dspp->dp; -#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) @@ -1945,7 +1962,6 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, if (version >= ZPL_VERSION_SA) *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } -#endif /* raw sends imply large_block_ok */ if ((dspp->rawok || dspp->large_block_ok) && @@ -2144,6 +2160,7 @@ setup_resume_points(struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os, redaction_list_t *redact_rl, nvlist_t *nvl) { + (void) smt_arg; dsl_dataset_t *to_ds = dspp->to_ds; int err = 0; @@ -2348,7 +2365,7 @@ dmu_send_impl(struct dmu_send_params *dspp) dsl_dataset_t *to_ds = dspp->to_ds; zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb; dsl_pool_t *dp = dspp->dp; - void *tag = dspp->tag; + const void *tag = dspp->tag; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { @@ -2497,8 +2514,7 @@ dmu_send_impl(struct dmu_send_params *dspp) } if (featureflags & DMU_BACKUP_FEATURE_RAW) { - uint64_t ivset_guid = (ancestor_zb != NULL) ? - ancestor_zb->zbm_ivset_guid : 0; + uint64_t ivset_guid = ancestor_zb->zbm_ivset_guid; nvlist_t *keynvl = NULL; ASSERT(os->os_encrypted); @@ -2536,7 +2552,7 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig(JUSTLOOKING) && issig(FORREAL)) + if (issig()) err = SET_ERROR(EINTR); } @@ -2583,7 +2599,7 @@ dmu_send_impl(struct dmu_send_params *dspp) * the receive side that the stream is incomplete. 
*/ if (!dspp->savedok) { - bzero(drr, sizeof (dmu_replay_record_t)); + memset(drr, 0, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc; drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid; @@ -2684,7 +2700,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, uint64_t size = dspp.numfromredactsnaps * sizeof (uint64_t); dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); - bcopy(fromredact, dspp.fromredactsnaps, size); + memcpy(dspp.fromredactsnaps, fromredact, size); } boolean_t is_before = @@ -2702,6 +2718,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; err = dmu_send_impl(&dspp); } + if (dspp.fromredactsnaps) + kmem_free(dspp.fromredactsnaps, + dspp.numfromredactsnaps * sizeof (uint64_t)); + dsl_dataset_rele(dspp.to_ds, FTAG); return (err); } @@ -2770,6 +2790,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } if (err == 0) { + owned = B_TRUE; err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, @@ -2783,21 +2804,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, sizeof (dspp.saved_toname), dspp.saved_toname); } - if (err != 0) + /* Only disown if there was an error in the lookups */ + if (owned && (err != 0)) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); kmem_strfree(name); } else { err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); + if (err == 0) + owned = B_TRUE; } - owned = B_TRUE; } else { err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err != 0) { + /* Note: dsl dataset is not owned at this point */ dsl_pool_rele(dspp.dp, FTAG); return (err); } @@ -2869,7 +2893,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, sizeof (uint64_t); dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); - bcopy(fromredact, dspp.fromredactsnaps, + memcpy(dspp.fromredactsnaps, fromredact, size); } if (!dsl_dataset_is_before(dspp.to_ds, fromds, @@ -2910,6 +2934,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, /* dmu_send_impl will call dsl_pool_rele for us. 
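 * (On this path the pool reference taken earlier in this function is handed
 * off to dmu_send_impl(); the else branch below has to drop it itself with
 * dsl_pool_rele() and also frees dspp.fromredactsnaps so it does not leak
 * when no send is attempted.)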
*/ err = dmu_send_impl(&dspp); } else { + if (dspp.fromredactsnaps) + kmem_free(dspp.fromredactsnaps, + dspp.numfromredactsnaps * + sizeof (uint64_t)); dsl_pool_rele(dspp.dp, FTAG); } } else { @@ -3002,7 +3030,7 @@ dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, dsl_dataset_name(origds, dsname); (void) strcat(dsname, "/"); - (void) strcat(dsname, recv_clone_name); + (void) strlcat(dsname, recv_clone_name, sizeof (dsname)); err = dsl_dataset_hold(origds->ds_dir->dd_pool, dsname, FTAG, &ds); @@ -3072,25 +3100,23 @@ out: return (err); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW, "Allow sending corrupt data"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, UINT, ZMOD_RW, "Maximum send queue length"); ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW, "Send unmodified spill blocks"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, UINT, ZMOD_RW, "Maximum send queue length for non-prefetch queues"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, UINT, ZMOD_RW, "Send queue fill fraction"); -ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, UINT, ZMOD_RW, "Send queue fill fraction for non-prefetch queues"); -ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, UINT, ZMOD_RW, "Override block size estimate with fixed size"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c index 862c0bf404ad..15cc2885e805 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_traverse.c +++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -39,9 +39,9 @@ #include <sys/callb.h> #include <sys/zfeature.h> -int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ -int32_t send_holes_without_birth_time = 1; -int32_t zfs_traverse_indirect_prefetch_limit = 32; +static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ +static int32_t send_holes_without_birth_time = 1; +static uint_t zfs_traverse_indirect_prefetch_limit = 32; typedef struct prefetch_data { kmutex_t pd_mtx; @@ -83,7 +83,8 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa)) return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, @@ -108,9 +109,10 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -153,10 +155,10 @@ typedef enum resume_skip { * Otherwise returns RESUME_SKIP_NONE. */ static resume_skip_t -resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, +resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + if (td->td_resume != NULL) { /* * If we already visited this bp & everything below, * don't bother doing it again. @@ -164,12 +166,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); - /* - * If we found the block we're trying to resume from, zero - * the bookmark out to indicate that we have resumed. - */ - if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { - bzero(td->td_resume, sizeof (*zb)); + if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) { if (td->td_flags & TRAVERSE_POST) return (RESUME_SKIP_CHILDREN); } @@ -181,22 +178,22 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE. */ static boolean_t -traverse_prefetch_metadata(traverse_data_t *td, +traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return (B_FALSE); /* - * If we are in the process of resuming, don't prefetch, because - * some children will not be needed (and in fact may have already - * been freed). + * If this bp is before the resume point, it may have already been + * freed. 
*/ - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) + if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE) return (B_FALSE); - if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return (B_FALSE); @@ -239,7 +236,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ASSERT(0); } - if (bp->blk_birth == 0) { + if (BP_GET_LOGICAL_BIRTH(bp) == 0) { /* * Since this block has a birth time of 0 it must be one of * two things: a hole created before the @@ -267,7 +264,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_object == DMU_META_DNODE_OBJECT) && td->td_hole_birth_enabled_txg <= td->td_min_txg) return (0); - } else if (bp->blk_birth <= td->td_min_txg) { + } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) { return (0); } @@ -342,7 +339,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + pidx); - if (traverse_prefetch_metadata(td, + if (traverse_prefetch_metadata(td, dnp, &((blkptr_t *)buf->b_data)[pidx], czb) == B_TRUE) { prefetched++; @@ -504,12 +501,12 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); + traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); + traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); } } @@ -560,11 +557,11 @@ traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp, return (err); } -/* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) zilog, (void) dnp; prefetch_data_t *pfd = arg; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | @@ -670,7 +667,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; @@ -809,11 +806,10 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags, EXPORT_SYMBOL(traverse_dataset); EXPORT_SYMBOL(traverse_pool); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW, "Max number of bytes to prefetch"); -ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW, "Traverse prefetch number of blocks pointed by indirect block"); #if defined(_KERNEL) @@ -822,6 +818,6 @@ MODULE_PARM_DESC(ignore_hole_birth, "Alias for send_holes_without_birth_time"); #endif +/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW, "Ignore hole_birth txg for zfs send"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 5fa516866668..8451b5082e86 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, - { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -210,16 +210,22 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, level, blkid, FTAG); + err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); - if (db == NULL) - return (SET_ERROR(EIO)); - err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + /* + * PARTIAL_FIRST allows caching for uncacheable blocks. It will + * be cleared after dmu_buf_will_dirty() call dbuf_read() again. + */ + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH | + (level == 0 ? DB_RF_PARTIAL_FIRST : 0)); dbuf_rele(db, FTAG); return (err); } -/* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { @@ -291,6 +297,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } static void +dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the append; first level-0 block (if not aligned, i.e. + * if they are partial-block writes), no additional blocks are read. 
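 *
 * A minimal caller sketch for the append hold added below
 * (dmu_tx_hold_append()), assuming the usual dmu_tx_create() /
 * dmu_tx_assign() / dmu_tx_commit() life cycle; the exact write path is
 * caller-specific:
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_append(tx, object, off, len);
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 *	if (err != 0) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);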
+ */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + +static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) zfs_refcount_add_many(&txh->txh_space_towrite, @@ -331,6 +384,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) } /* + * Should be used when appending to an object and the exact offset is unknown. + * The write must occur at or beyond the specified offset. Only the L0 block + * at provided offset will be prefetched. + */ +void +dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +/* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead @@ -345,7 +434,7 @@ dmu_tx_mark_netfree(dmu_tx_t *tx) } static void -dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; @@ -353,15 +442,11 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) ASSERT(tx->tx_txg == 0); - dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; - dmu_tx_count_dnode(txh); - /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. @@ -441,8 +526,10 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); + } } void @@ -451,8 +538,35 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); - if (txh != NULL) - (void) dmu_tx_hold_free_impl(txh, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_free(txh, off, len); + } +} + +static void +dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + + /* + * Reuse dmu_tx_count_free(), it does exactly what we need for clone. 
+ */ + dmu_tx_count_free(txh, off, len); +} + +void +dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len); + if (txh != NULL) { + dmu_tx_count_dnode(txh); + dmu_tx_count_clone(txh, off, len); + } } static void @@ -461,6 +575,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) dmu_tx_t *tx = txh->txh_tx; dnode_t *dn = txh->txh_dnode; int err; + extern int zap_micro_max_size; ASSERT(tx->tx_txg == 0); @@ -476,7 +591,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) * - 2 grown ptrtbl blocks */ (void) zfs_refcount_add_many(&txh->txh_space_towrite, - MZAP_MAX_BLKSZ, FTAG); + zap_micro_max_size, FTAG); if (dn == NULL) return; @@ -638,6 +753,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) if (blkid == 0) match_offset = TRUE; break; + case THT_APPEND: + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + + /* + * THT_WRITE used for bonus and spill blocks. + */ + ASSERT(blkid != DMU_BONUS_BLKID && + blkid != DMU_SPILL_BLKID); + + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; case THT_FREE: /* * We will dirty all the level 1 blocks in @@ -662,6 +797,10 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) case THT_NEWOBJECT: match_object = TRUE; break; + case THT_CLONE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + break; default: cmn_err(CE_PANIC, "bad txh_type %d", txh->txh_type); @@ -683,8 +822,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ -hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ -int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ +static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ /* * We delay transactions when we've determined that the backend storage @@ -781,34 +919,49 @@ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = + uint64_t delay_min_bytes, wrlog; + hrtime_t wakeup, tx_time = 0, now; + + /* Calculate minimum transaction time for the dirty data amount. */ + delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + if (dirty > delay_min_bytes) { + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which + * could cause a divide-by-zero if it's == the max. + */ + ASSERT3U(dirty, <, zfs_dirty_data_max); - if (dirty <= delay_min_bytes) - return; + tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / + (zfs_dirty_data_max - dirty); + } - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); + /* Calculate minimum transaction time for the TX_WRITE log size. 
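 * The same curve as the dirty-data term above is applied: no delay below
 * the threshold, a scaled delay between the threshold and the limit, and
 * the full zfs_delay_max_ns once the limit is reached.  Worked example
 * with assumed (non-default) numbers: zfs_wrlog_data_max = 8GB and
 * zfs_delay_min_dirty_percent = 60 give a threshold of ~4.8GB; at
 * wrlog = 6GB the term is zfs_delay_scale * (6 - 4.8) / (8 - 6), i.e.
 * 0.6 * zfs_delay_scale, about 300us with the default 500000ns scale.
 * The larger of the dirty-data and wrlog terms is used, capped at
 * zfs_delay_max_ns.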
*/ + wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); + delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + if (wrlog >= zfs_wrlog_data_max) { + tx_time = zfs_delay_max_ns; + } else if (wrlog > delay_min_bytes) { + tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / + (zfs_wrlog_data_max - wrlog), tx_time); + } + + if (tx_time == 0) + return; + tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); + uint64_t, tx_time); mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); + wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); @@ -886,8 +1039,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } if (!tx->tx_dirty_delayed && - dsl_pool_wrlog_over_max(tx->tx_pool)) { - DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + dsl_pool_need_wrlog_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); return (SET_ERROR(ERESTART)); } @@ -1244,8 +1398,7 @@ dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; - while ((dcb = list_tail(cb_list)) != NULL) { - list_remove(cb_list, dcb); + while ((dcb = list_remove_tail(cb_list)) != NULL) { dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } @@ -1405,6 +1558,8 @@ dmu_tx_fini(void) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); +EXPORT_SYMBOL(dmu_tx_hold_append); +EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 043344a1375f..ed50f1889b59 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,6 +28,7 @@ */ #include <sys/zfs_context.h> +#include <sys/arc_impl.h> #include <sys/dnode.h> #include <sys/dmu_objset.h> #include <sys/dmu_zfetch.h> @@ -43,38 +44,63 @@ * so it can't hurt performance. 
*/ -int zfs_prefetch_disable = B_FALSE; +static int zfs_prefetch_disable = B_FALSE; /* max # of streams per zfetch */ -unsigned int zfetch_max_streams = 8; +static unsigned int zfetch_max_streams = 8; /* min time before stream reclaim */ -unsigned int zfetch_min_sec_reap = 2; +static unsigned int zfetch_min_sec_reap = 1; +/* max time before stream delete */ +static unsigned int zfetch_max_sec_reap = 2; +#ifdef _ILP32 +/* min bytes to prefetch per stream (default 2MB) */ +static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ unsigned int zfetch_max_distance = 8 * 1024 * 1024; +#else +/* min bytes to prefetch per stream (default 4MB) */ +static unsigned int zfetch_min_distance = 4 * 1024 * 1024; +/* max bytes to prefetch per stream (default 64MB) */ +unsigned int zfetch_max_distance = 64 * 1024 * 1024; +#endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; -/* max number of bytes in an array_read in which we allow prefetching (1MB) */ -unsigned long zfetch_array_rd_sz = 1024 * 1024; +/* max request reorder distance within a stream (default 16MB) */ +unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +/* Max log2 fraction of holes in a stream */ +unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_future; + kstat_named_t zfetchstat_stride; + kstat_named_t zfetchstat_past; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; + kstat_named_t zfetchstat_io_active; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "future", KSTAT_DATA_UINT64 }, + { "stride", KSTAT_DATA_UINT64 }, + { "past", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, - { "io_issued", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, + { "io_active", KSTAT_DATA_UINT64 }, }; struct { wmsum_t zfetchstat_hits; + wmsum_t zfetchstat_future; + wmsum_t zfetchstat_stride; + wmsum_t zfetchstat_past; wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; + aggsum_t zfetchstat_io_active; } zfetch_sums; #define ZFETCHSTAT_BUMP(stat) \ @@ -83,7 +109,7 @@ struct { wmsum_add(&zfetch_sums.stat, val) -kstat_t *zfetch_ksp; +static kstat_t *zfetch_ksp; static int zfetch_kstats_update(kstat_t *ksp, int rw) @@ -94,12 +120,20 @@ zfetch_kstats_update(kstat_t *ksp, int rw) return (EACCES); zs->zfetchstat_hits.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_hits); + zs->zfetchstat_future.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_future); + zs->zfetchstat_stride.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_stride); + zs->zfetchstat_past.value.ui64 = + wmsum_value(&zfetch_sums.zfetchstat_past); zs->zfetchstat_misses.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_misses); zs->zfetchstat_max_streams.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_max_streams); zs->zfetchstat_io_issued.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_io_issued); + zs->zfetchstat_io_active.value.ui64 = + aggsum_value(&zfetch_sums.zfetchstat_io_active); return (0); } @@ -107,9 +141,13 @@ void zfetch_init(void) { wmsum_init(&zfetch_sums.zfetchstat_hits, 0); + wmsum_init(&zfetch_sums.zfetchstat_future, 0); + wmsum_init(&zfetch_sums.zfetchstat_stride, 0); + wmsum_init(&zfetch_sums.zfetchstat_past, 0); wmsum_init(&zfetch_sums.zfetchstat_misses, 0); 
wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); + aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), @@ -131,9 +169,14 @@ zfetch_fini(void) } wmsum_fini(&zfetch_sums.zfetchstat_hits); + wmsum_fini(&zfetch_sums.zfetchstat_future); + wmsum_fini(&zfetch_sums.zfetchstat_stride); + wmsum_fini(&zfetch_sums.zfetchstat_past); wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); + ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); + aggsum_fini(&zfetch_sums.zfetchstat_io_active); } /* @@ -195,75 +238,219 @@ dmu_zfetch_fini(zfetch_t *zf) } /* - * If there aren't too many streams already, create a new stream. + * If there aren't too many active streams already, create one more. + * In process delete/reuse all streams without hits for zfetch_max_sec_reap. + * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. * The "blkid" argument is the next block that we expect this stream to access. - * While we're here, clean up old streams (which haven't been - * accessed for at least zfetch_min_sec_reap seconds). */ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { - zstream_t *zs_next; - hrtime_t now = gethrtime(); + zstream_t *zs, *zs_next, *zs_old = NULL; + uint_t now = gethrestime_sec(), t; ASSERT(MUTEX_HELD(&zf->zf_lock)); /* - * Clean up old streams. + * Delete too old streams, reusing the first found one. */ - for (zstream_t *zs = list_head(&zf->zf_stream); - zs != NULL; zs = zs_next) { + t = now - zfetch_max_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* * Skip if still active. 1 -- zf_stream reference. */ + if ((int)(zs->zs_atime - t) >= 0) + continue; if (zfs_refcount_count(&zs->zs_refs) != 1) continue; - if (((now - zs->zs_atime) / NANOSEC) > - zfetch_min_sec_reap) + if (zs_old) dmu_zfetch_stream_remove(zf, zs); + else + zs_old = zs; + } + if (zs_old) { + zs = zs_old; + list_remove(&zf->zf_stream, zs); + goto reuse; } /* * The maximum number of streams is normally zfetch_max_streams, * but for small files we lower it such that it's at least possible * for all the streams to be non-overlapping. - * - * If we are already at the maximum number of streams for this file, - * even after removing old streams, then don't create this stream. 
*/ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, - zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / + (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) / zfetch_max_distance)); if (zf->zf_numstreams >= max_streams) { + t = now - zfetch_min_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if ((int)(zs->zs_atime - t) >= 0) + continue; + if (zfs_refcount_count(&zs->zs_refs) != 1) + continue; + if (zs_old == NULL || + (int)(zs_old->zs_atime - zs->zs_atime) >= 0) + zs_old = zs; + } + if (zs_old) { + zs = zs_old; + list_remove(&zf->zf_stream, zs); + goto reuse; + } ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } - zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); - zs->zs_blkid = blkid; - zs->zs_pf_blkid1 = blkid; - zs->zs_pf_blkid = blkid; - zs->zs_ipf_blkid1 = blkid; - zs->zs_ipf_blkid = blkid; - zs->zs_atime = now; - zs->zs_fetch = zf; - zs->zs_missed = B_FALSE; + zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); zfs_refcount_create(&zs->zs_callers); zfs_refcount_create(&zs->zs_refs); /* One reference for zf_stream. */ zfs_refcount_add(&zs->zs_refs, NULL); zf->zf_numstreams++; + +reuse: list_insert_head(&zf->zf_stream, zs); + zs->zs_blkid = blkid; + /* Allow immediate stream reuse until first hit. */ + zs->zs_atime = now - zfetch_min_sec_reap; + memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges)); + zs->zs_pf_dist = 0; + zs->zs_ipf_dist = 0; + zs->zs_pf_start = blkid; + zs->zs_pf_end = blkid; + zs->zs_ipf_start = blkid; + zs->zs_ipf_end = blkid; + zs->zs_missed = B_FALSE; + zs->zs_more = B_FALSE; } static void -dmu_zfetch_stream_done(void *arg, boolean_t io_issued) +dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) { zstream_t *zs = arg; + if (io_issued && level == 0 && blkid < zs->zs_blkid) + zs->zs_more = B_TRUE; if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); + aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); +} + +/* + * Process stream hit access for nblks blocks starting at zs_blkid. Return + * number of blocks to proceed for after aggregation with future ranges. + */ +static uint64_t +dmu_zfetch_hit(zstream_t *zs, uint64_t nblks) +{ + uint_t i, j; + + /* Optimize sequential accesses (no future ranges). */ + if (zs->zs_ranges[0].start == 0) + goto done; + + /* Look for intersections with further ranges. */ + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > nblks) + break; + if (r->end >= nblks) { + nblks = r->end; + i++; + break; + } + } + + /* Delete all found intersecting ranges, updates remaining. */ + for (j = 0; i < ZFETCH_RANGES; i++, j++) { + if (zs->zs_ranges[i].start == 0) + break; + ASSERT3U(zs->zs_ranges[i].start, >, nblks); + ASSERT3U(zs->zs_ranges[i].end, >, nblks); + zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks; + zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks; + } + if (j < ZFETCH_RANGES) { + zs->zs_ranges[j].start = 0; + zs->zs_ranges[j].end = 0; + } + +done: + zs->zs_blkid += nblks; + return (nblks); +} + +/* + * Process future stream access for nblks blocks starting at blkid. Return + * number of blocks to proceed for if future ranges reach fill threshold. + */ +static uint64_t +dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) +{ + ASSERT3U(blkid, >, zs->zs_blkid); + blkid -= zs->zs_blkid; + ASSERT3U(blkid + nblks, <=, UINT16_MAX); + + /* Search for first and last intersection or insert point. 
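 * For example, with pending ranges [2,4) and [7,9) (block numbers relative
 * to zs_blkid), a new access covering relative blocks [4,7) touches both
 * ranges, so they collapse into the single range [2,9).  An access beyond
 * all stored ranges is inserted as a new entry, and if all ZFETCH_RANGES
 * slots are already in use the access is simply dropped.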
*/ + uint_t f = ZFETCH_RANGES, l = 0, i; + for (i = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0 || r->start > blkid + nblks) + break; + if (r->end < blkid) + continue; + if (f > i) + f = i; + if (l < i) + l = i; + } + if (f <= l) { + /* Got some intersecting range, expand it if needed. */ + if (zs->zs_ranges[f].start > blkid) + zs->zs_ranges[f].start = blkid; + zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks); + if (f < l) { + /* Got more than one intersection, remove others. */ + for (f++, l++; l < ZFETCH_RANGES; f++, l++) { + zs->zs_ranges[f].start = zs->zs_ranges[l].start; + zs->zs_ranges[f].end = zs->zs_ranges[l].end; + } + zs->zs_ranges[f].start = 0; + zs->zs_ranges[f].end = 0; + } + } else if (i < ZFETCH_RANGES) { + /* Got no intersecting ranges, insert new one. */ + for (l = ZFETCH_RANGES - 1; l > i; l--) { + zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start; + zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end; + } + zs->zs_ranges[i].start = blkid; + zs->zs_ranges[i].end = blkid + nblks; + } else { + /* No space left to insert. Drop the range. */ + return (0); + } + + /* Check if with the new access addition we reached fill threshold. */ + if (zfetch_hole_shift >= 16) + return (0); + uint_t hole = 0; + for (i = f = l = 0; i < ZFETCH_RANGES; i++) { + zsrange_t *r = &zs->zs_ranges[i]; + if (r->start == 0) + break; + hole += r->start - f; + f = r->end; + if (hole <= r->end >> zfetch_hole_shift) + l = r->end; + } + if (l > 0) + return (dmu_zfetch_hit(zs, l)); + + return (0); } /* @@ -283,15 +470,15 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, boolean_t have_lock) { zstream_t *zs; - int64_t pf_start, ipf_start; - int64_t pf_ahead_blks, max_blks; - int max_dist_blks, pf_nblks, ipf_nblks; - uint64_t end_of_access_blkid, maxblkid; - end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; + zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; - if (zfs_prefetch_disable) + if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) return (NULL); + + if (os_prefetch == ZFS_PREFETCH_METADATA) + fetch_data = B_FALSE; + /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on @@ -316,7 +503,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * A fast path for small files for which no prefetch will * happen. */ - maxblkid = zf->zf_dnode->dn_maxblkid; + uint64_t maxblkid = zf->zf_dnode->dn_maxblkid; if (maxblkid < 2) { if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); @@ -325,115 +512,147 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, mutex_enter(&zf->zf_lock); /* - * Find matching prefetch stream. Depending on whether the accesses + * Find perfect prefetch stream. Depending on whether the accesses * are block-aligned, first block of the new access may either follow * the last block of the previous access, or be equal to it. */ + unsigned int dbs = zf->zf_dnode->dn_datablkshift; + uint64_t end_blkid = blkid + nblks; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if (blkid == zs->zs_blkid) { - break; + goto hit; } else if (blkid + 1 == zs->zs_blkid) { blkid++; nblks--; - break; + goto hit; } } /* - * If the file is ending, remove the matching stream if found. - * If not found then it is too late to create a new one now. + * Find close enough prefetch stream. 
Access crossing stream position + * is a hit in its new part. Access ahead of stream position considered + * a hit for metadata prefetch, since we do not care about fill percent, + * or stored for future otherwise. Access behind stream position is + * silently ignored, since we already skipped it reaching fill percent. */ - if (end_of_access_blkid >= maxblkid) { - if (zs != NULL) - dmu_zfetch_stream_remove(zf, zs); - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); + uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX); + uint_t t = gethrestime_sec() - zfetch_max_sec_reap; + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if (blkid > zs->zs_blkid) { + if (end_blkid <= zs->zs_blkid + max_reorder) { + if (!fetch_data) { + nblks = dmu_zfetch_hit(zs, + end_blkid - zs->zs_blkid); + ZFETCHSTAT_BUMP(zfetchstat_stride); + goto future; + } + nblks = dmu_zfetch_future(zs, blkid, nblks); + if (nblks > 0) + ZFETCHSTAT_BUMP(zfetchstat_stride); + else + ZFETCHSTAT_BUMP(zfetchstat_future); + goto future; + } + } else if (end_blkid >= zs->zs_blkid) { + nblks -= zs->zs_blkid - blkid; + blkid += zs->zs_blkid - blkid; + goto hit; + } else if (end_blkid + max_reorder > zs->zs_blkid && + (int)(zs->zs_atime - t) >= 0) { + ZFETCHSTAT_BUMP(zfetchstat_past); + zs->zs_atime = gethrestime_sec(); + goto out; + } } - /* Exit if we already prefetched this block before. */ - if (nblks == 0) { - mutex_exit(&zf->zf_lock); - if (!have_lock) - rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return (NULL); - } + /* + * This access is not part of any existing stream. Create a new + * stream for it unless we are at the end of file. + */ + if (end_blkid < maxblkid) + dmu_zfetch_stream_create(zf, end_blkid); + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + ZFETCHSTAT_BUMP(zfetchstat_misses); + return (NULL); - if (zs == NULL) { - /* - * This access is not part of any existing stream. Create - * a new stream for it. - */ - dmu_zfetch_stream_create(zf, end_of_access_blkid); +hit: + nblks = dmu_zfetch_hit(zs, nblks); + ZFETCHSTAT_BUMP(zfetchstat_hits); + +future: + zs->zs_atime = gethrestime_sec(); + + /* Exit if we already prefetched for this position before. */ + if (nblks == 0) + goto out; + + /* If the file is ending, remove the stream. */ + end_blkid = zs->zs_blkid; + if (end_blkid >= maxblkid) { + dmu_zfetch_stream_remove(zf, zs); +out: mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - ZFETCHSTAT_BUMP(zfetchstat_misses); return (NULL); } /* * This access was to a block that we issued a prefetch for on - * behalf of this stream. Issue further prefetches for this stream. + * behalf of this stream. Calculate further prefetch distances. * - * Normally, we start prefetching where we stopped - * prefetching last (zs_pf_blkid). But when we get our first - * hit on this stream, zs_pf_blkid == zs_blkid, we don't - * want to prefetch the block we just accessed. In this case, - * start just after the block we just accessed. - */ - pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); - if (zs->zs_pf_blkid1 < end_of_access_blkid) - zs->zs_pf_blkid1 = end_of_access_blkid; - if (zs->zs_ipf_blkid1 < end_of_access_blkid) - zs->zs_ipf_blkid1 = end_of_access_blkid; - - /* - * Double our amount of prefetched data, but don't let the - * prefetch get further ahead than zfetch_max_distance. + * Start prefetch from the demand access size (nblks). 
Double the + * distance every access up to zfetch_min_distance. After that only + * if needed increase the distance by 1/8 up to zfetch_max_distance. + * + * Don't double the distance beyond single block if we have more + * than ~6% of ARC held by active prefetches. It should help with + * getting out of RAM on some badly mispredicted read patterns. */ + unsigned int nbytes = nblks << dbs; + unsigned int pf_nblks; if (fetch_data) { - max_dist_blks = - zfetch_max_distance >> zf->zf_dnode->dn_datablkshift; - /* - * Previously, we were (zs_pf_blkid - blkid) ahead. We - * want to now be double that, so read that amount again, - * plus the amount we are catching up by (i.e. the amount - * read just now). - */ - pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks; - max_blks = max_dist_blks - (pf_start - end_of_access_blkid); - pf_nblks = MIN(pf_ahead_blks, max_blks); + if (unlikely(zs->zs_pf_dist < nbytes)) + zs->zs_pf_dist = nbytes; + else if (zs->zs_pf_dist < zfetch_min_distance && + (zs->zs_pf_dist < (1 << dbs) || + aggsum_compare(&zfetch_sums.zfetchstat_io_active, + arc_c_max >> (4 + dbs)) < 0)) + zs->zs_pf_dist *= 2; + else if (zs->zs_more) + zs->zs_pf_dist += zs->zs_pf_dist / 8; + zs->zs_more = B_FALSE; + if (zs->zs_pf_dist > zfetch_max_distance) + zs->zs_pf_dist = zfetch_max_distance; + pf_nblks = zs->zs_pf_dist >> dbs; } else { pf_nblks = 0; } - - zs->zs_pf_blkid = pf_start + pf_nblks; + if (zs->zs_pf_start < end_blkid) + zs->zs_pf_start = end_blkid; + if (zs->zs_pf_end < end_blkid + pf_nblks) + zs->zs_pf_end = end_blkid + pf_nblks; /* - * Do the same for indirects, starting from where we stopped last, - * or where we will stop reading data blocks (and the indirects - * that point to them). + * Do the same for indirects, starting where we will stop reading + * data blocks (and the indirects that point to them). */ - ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid); - max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift; - /* - * We want to double our distance ahead of the data prefetch - * (or reader, if we are not prefetching data). Previously, we - * were (zs_ipf_blkid - blkid) ahead. To double that, we read - * that amount again, plus the amount we are catching up by - * (i.e. the amount read now + the amount of data prefetched now). - */ - pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; - max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid); - ipf_nblks = MIN(pf_ahead_blks, max_blks); - zs->zs_ipf_blkid = ipf_start + ipf_nblks; - - zs->zs_blkid = end_of_access_blkid; - /* Protect the stream from reclamation. */ - zs->zs_atime = gethrtime(); + if (unlikely(zs->zs_ipf_dist < nbytes)) + zs->zs_ipf_dist = nbytes; + else + zs->zs_ipf_dist *= 2; + if (zs->zs_ipf_dist > zfetch_max_idistance) + zs->zs_ipf_dist = zfetch_max_idistance; + pf_nblks = zs->zs_ipf_dist >> dbs; + if (zs->zs_ipf_start < zs->zs_pf_end) + zs->zs_ipf_start = zs->zs_pf_end; + if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) + zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; + zfs_refcount_add(&zs->zs_refs, NULL); /* Count concurrent callers. 
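 * The zs_refs reference taken just above keeps the stream alive until
 * dmu_zfetch_run(), which either converts it into one reference per issued
 * prefetch I/O (adding issued - 1 more) or drops it when another thread has
 * already covered the range; each dmu_zfetch_done() callback then releases
 * one reference.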
*/ zfs_refcount_add(&zs->zs_callers, NULL); @@ -441,15 +660,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - - ZFETCHSTAT_BUMP(zfetchstat_hits); return (zs); } void -dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) +dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, + boolean_t have_lock) { - zfetch_t *zf = zs->zs_fetch; int64_t pf_start, pf_end, ipf_start, ipf_end; int epbs, issued; @@ -470,13 +687,13 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) mutex_enter(&zf->zf_lock); if (zs->zs_missed) { - pf_start = zs->zs_pf_blkid1; - pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid; + pf_start = zs->zs_pf_start; + pf_end = zs->zs_pf_start = zs->zs_pf_end; } else { pf_start = pf_end = 0; } - ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1); - ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid; + ipf_start = zs->zs_ipf_start; + ipf_end = zs->zs_ipf_start = zs->zs_ipf_end; mutex_exit(&zf->zf_lock); ASSERT3S(pf_start, <=, pf_end); ASSERT3S(ipf_start, <=, ipf_end); @@ -488,14 +705,14 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = pf_end - pf_start + ipf_end - ipf_start; if (issued > 1) { /* More references on top of taken in dmu_zfetch_prepare(). */ - for (int i = 0; i < issued - 1; i++) - zfs_refcount_add(&zs->zs_refs, NULL); + zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL); } else if (issued == 0) { /* Some other thread has done our work, so drop the ref. */ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); return; } + aggsum_add(&zfetch_sums.zfetchstat_io_active, issued); if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); @@ -503,13 +720,11 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = 0; for (int64_t blk = pf_start; blk < pf_end; blk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_stream_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, - dmu_zfetch_stream_done, zs); + ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); } if (!have_lock) @@ -527,10 +742,9 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); if (zs) - dmu_zfetch_run(zs, missed, have_lock); + dmu_zfetch_run(zf, zs, missed, have_lock); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, "Disable all ZFS prefetching"); @@ -540,12 +754,20 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, "Min time before stream reclaim"); +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW, + "Max time before stream delete"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW, + "Min bytes to prefetch per stream"); + ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, "Max bytes to prefetch per stream"); ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); -ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW, - "Number of bytes in a array_read"); -/* END CSTYLED */ 
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW, + "Max request reorder distance within a stream"); + +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW, + "Max log2 fraction of holes in a stream"); diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index db1a5d71df3c..a703fd414f87 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -71,6 +71,8 @@ dnode_stats_t dnode_stats = { { "dnode_move_active", KSTAT_DATA_UINT64 }, }; +dnode_sums_t dnode_sums; + static kstat_t *dnode_ksp; static kmem_cache_t *dnode_cache; @@ -97,6 +99,14 @@ dbuf_compare(const void *x1, const void *x2) if (likely(cmp)) return (cmp); + if (d1->db_state == DB_MARKER) { + ASSERT3S(d2->db_state, !=, DB_MARKER); + return (TREE_PCMP(d1->db_parent, d2)); + } else if (d2->db_state == DB_MARKER) { + ASSERT3S(d1->db_state, !=, DB_MARKER); + return (TREE_PCMP(d1, d2->db_parent)); + } + if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); @@ -108,12 +118,11 @@ dbuf_compare(const void *x1, const void *x2) return (TREE_PCMP(d1, d2)); } -/* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { + (void) unused, (void) kmflag; dnode_t *dn = arg; - int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -129,17 +138,17 @@ dnode_cons(void *arg, void *unused, int kmflag) zfs_refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); - bzero(&dn->dn_next_type[0], sizeof (dn->dn_next_type)); - bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); - bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); - bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); - bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); - bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); - bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); - bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); - bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid)); + memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type)); + memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr)); + memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels)); + memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift)); + memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype)); + memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk)); + memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen)); + memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz)); + memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid)); - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { multilist_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], @@ -174,11 +183,10 @@ dnode_cons(void *arg, void *unused, int kmflag) return (0); } -/* ARGSUSED */ static void dnode_dest(void *arg, void *unused) { - int i; + (void) unused; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); @@ -190,7 +198,7 @@ dnode_dest(void *arg, void *unused) 
zfs_refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); @@ -227,6 +235,72 @@ dnode_dest(void *arg, void *unused) avl_destroy(&dn->dn_dbufs); } +static int +dnode_kstats_update(kstat_t *ksp, int rw) +{ + dnode_stats_t *ds = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + ds->dnode_hold_dbuf_hold.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_dbuf_hold); + ds->dnode_hold_dbuf_read.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_dbuf_read); + ds->dnode_hold_alloc_hits.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_hits); + ds->dnode_hold_alloc_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_misses); + ds->dnode_hold_alloc_interior.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_interior); + ds->dnode_hold_alloc_lock_retry.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry); + ds->dnode_hold_alloc_lock_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses); + ds->dnode_hold_alloc_type_none.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_alloc_type_none); + ds->dnode_hold_free_hits.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_hits); + ds->dnode_hold_free_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_misses); + ds->dnode_hold_free_lock_misses.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_lock_misses); + ds->dnode_hold_free_lock_retry.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_lock_retry); + ds->dnode_hold_free_refcount.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_refcount); + ds->dnode_hold_free_overflow.value.ui64 = + wmsum_value(&dnode_sums.dnode_hold_free_overflow); + ds->dnode_free_interior_lock_retry.value.ui64 = + wmsum_value(&dnode_sums.dnode_free_interior_lock_retry); + ds->dnode_allocate.value.ui64 = + wmsum_value(&dnode_sums.dnode_allocate); + ds->dnode_reallocate.value.ui64 = + wmsum_value(&dnode_sums.dnode_reallocate); + ds->dnode_buf_evict.value.ui64 = + wmsum_value(&dnode_sums.dnode_buf_evict); + ds->dnode_alloc_next_chunk.value.ui64 = + wmsum_value(&dnode_sums.dnode_alloc_next_chunk); + ds->dnode_alloc_race.value.ui64 = + wmsum_value(&dnode_sums.dnode_alloc_race); + ds->dnode_alloc_next_block.value.ui64 = + wmsum_value(&dnode_sums.dnode_alloc_next_block); + ds->dnode_move_invalid.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_invalid); + ds->dnode_move_recheck1.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_recheck1); + ds->dnode_move_recheck2.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_recheck2); + ds->dnode_move_special.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_special); + ds->dnode_move_handle.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_handle); + ds->dnode_move_rwlock.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_rwlock); + ds->dnode_move_active.value.ui64 = + wmsum_value(&dnode_sums.dnode_move_active); + return (0); +} + void dnode_init(void) { @@ -235,11 +309,41 @@ dnode_init(void) 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); kmem_cache_set_move(dnode_cache, dnode_move); + wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0); + wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0); + 
wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0); + wmsum_init(&dnode_sums.dnode_hold_free_hits, 0); + wmsum_init(&dnode_sums.dnode_hold_free_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0); + wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0); + wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0); + wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0); + wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0); + wmsum_init(&dnode_sums.dnode_allocate, 0); + wmsum_init(&dnode_sums.dnode_reallocate, 0); + wmsum_init(&dnode_sums.dnode_buf_evict, 0); + wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0); + wmsum_init(&dnode_sums.dnode_alloc_race, 0); + wmsum_init(&dnode_sums.dnode_alloc_next_block, 0); + wmsum_init(&dnode_sums.dnode_move_invalid, 0); + wmsum_init(&dnode_sums.dnode_move_recheck1, 0); + wmsum_init(&dnode_sums.dnode_move_recheck2, 0); + wmsum_init(&dnode_sums.dnode_move_special, 0); + wmsum_init(&dnode_sums.dnode_move_handle, 0); + wmsum_init(&dnode_sums.dnode_move_rwlock, 0); + wmsum_init(&dnode_sums.dnode_move_active, 0); + dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dnode_ksp != NULL) { dnode_ksp->ks_data = &dnode_stats; + dnode_ksp->ks_update = dnode_kstats_update; kstat_install(dnode_ksp); } } @@ -252,6 +356,35 @@ dnode_fini(void) dnode_ksp = NULL; } + wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold); + wmsum_fini(&dnode_sums.dnode_hold_dbuf_read); + wmsum_fini(&dnode_sums.dnode_hold_alloc_hits); + wmsum_fini(&dnode_sums.dnode_hold_alloc_misses); + wmsum_fini(&dnode_sums.dnode_hold_alloc_interior); + wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry); + wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses); + wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none); + wmsum_fini(&dnode_sums.dnode_hold_free_hits); + wmsum_fini(&dnode_sums.dnode_hold_free_misses); + wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses); + wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry); + wmsum_fini(&dnode_sums.dnode_hold_free_refcount); + wmsum_fini(&dnode_sums.dnode_hold_free_overflow); + wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry); + wmsum_fini(&dnode_sums.dnode_allocate); + wmsum_fini(&dnode_sums.dnode_reallocate); + wmsum_fini(&dnode_sums.dnode_buf_evict); + wmsum_fini(&dnode_sums.dnode_alloc_next_chunk); + wmsum_fini(&dnode_sums.dnode_alloc_race); + wmsum_fini(&dnode_sums.dnode_alloc_next_block); + wmsum_fini(&dnode_sums.dnode_move_invalid); + wmsum_fini(&dnode_sums.dnode_move_recheck1); + wmsum_fini(&dnode_sums.dnode_move_recheck2); + wmsum_fini(&dnode_sums.dnode_move_special); + wmsum_fini(&dnode_sums.dnode_move_handle); + wmsum_fini(&dnode_sums.dnode_move_rwlock); + wmsum_fini(&dnode_sums.dnode_move_active); + kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } @@ -319,7 +452,7 @@ dnode_byteswap(dnode_phys_t *dnp) int i; if (dnp->dn_type == DMU_OT_NONE) { - bzero(dnp, sizeof (dnode_phys_t)); + memset(dnp, 0, sizeof (dnode_phys_t)); return; } @@ -344,20 +477,11 @@ dnode_byteswap(dnode_phys_t *dnp) * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { - /* - * Note that the bonus length calculated here may be - * longer than the actual bonus buffer. This is because - * we always put the bonus buffer after the last block - * pointer (instead of packing it against the end of the - * dnode buffer). 
- */ - int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); - int slots = dnp->dn_extra_slots + 1; - size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off; dmu_object_byteswap_t byteswap; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); - dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); + dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp), + DN_MAX_BONUS_LEN(dnp)); } /* Swap SPILL block if we have one */ @@ -397,7 +521,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) /* clear any data after the end of the new size */ size_t diff = dn->dn_bonuslen - newsize; char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize; - bzero(data_end, diff); + memset(data_end, 0, diff); } dn->dn_bonuslen = newsize; @@ -598,12 +722,13 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, DNODE_STAT_BUMP(dnode_allocate); ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t))); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || + (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); @@ -751,8 +876,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { - int i; - ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); @@ -776,29 +899,29 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; ndn->dn_num_slots = odn->dn_num_slots; - bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], + memcpy(ndn->dn_next_type, odn->dn_next_type, sizeof (odn->dn_next_type)); - bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr, sizeof (odn->dn_next_nblkptr)); - bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels, sizeof (odn->dn_next_nlevels)); - bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift, sizeof (odn->dn_next_indblkshift)); - bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype, sizeof (odn->dn_next_bonustype)); - bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk, sizeof (odn->dn_rm_spillblk)); - bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen, sizeof (odn->dn_next_bonuslen)); - bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + memcpy(ndn->dn_next_blksz, odn->dn_next_blksz, sizeof (odn->dn_next_blksz)); - bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0], + memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid, sizeof (odn->dn_next_maxblkid)); - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } - bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], + memcpy(ndn->dn_free_ranges, odn->dn_free_ranges, sizeof (odn->dn_free_ranges)); 
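This hunk (and many that follow) mechanically replaces the legacy BSD routines bcopy/bzero/bcmp with the ISO C memcpy/memset/memcmp. The only trap in the conversion is that bcopy and memcpy take source and destination in opposite order. A tiny standalone illustration of the equivalences being relied on throughout the diff:

	#include <assert.h>
	#include <string.h>

	int
	main(void)
	{
		char src[8] = "dnode", dst[8] = {0}, zero[8] = {0};

		/* bcopy(src, dst, n) becomes memcpy(dst, src, n): args swap */
		memcpy(dst, src, sizeof (src));
		/* bcmp(a, b, n) == 0 becomes memcmp(a, b, n) == 0 */
		assert(memcmp(dst, src, sizeof (src)) == 0);
		/* bzero(p, n) becomes memset(p, 0, n) */
		memset(dst, 0, sizeof (dst));
		assert(memcmp(dst, zero, sizeof (dst)) == 0);
		return (0);
	}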
ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; @@ -852,7 +975,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) /* * Satisfy the destructor. */ - for (i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); @@ -889,7 +1012,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_moved = (uint8_t)-1; } -/*ARGSUSED*/ static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { @@ -1123,9 +1245,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) return (B_TRUE); } -static void +static uint_t dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) { + uint_t reclaimed = 0; + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); for (int i = idx; i < idx + slots; i++) { @@ -1137,8 +1261,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); dnode_destroy(dnh->dnh_dnode); dnh->dnh_dnode = DN_SLOT_FREE; + reclaimed++; } } + + return (reclaimed); } void @@ -1156,7 +1283,7 @@ dnode_free_interior_slots(dnode_t *dn) while (!dnode_slots_tryenter(children, idx, slots)) { DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } dnode_set_slots(children, idx, slots, DN_SLOT_FREE); @@ -1273,7 +1400,7 @@ dnode_buf_evict_async(void *dbu) */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, - void *tag, dnode_t **dnp) + const void *tag, dnode_t **dnp) { int epb, idx, err; int drop_struct_lock = FALSE; @@ -1437,7 +1564,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } /* @@ -1451,6 +1578,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, + sizeof (dnode_t)); } } @@ -1492,7 +1621,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } if (!dnode_check_slots_free(dnc, idx, slots)) { @@ -1508,8 +1637,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * to be freed. Single slot dnodes can be safely * re-purposed as a performance optimization. */ - if (slots > 1) - dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (slots > 1) { + uint_t reclaimed = + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + if (reclaimed > 0) + dmu_buf_sub_user_size(&db->db, + reclaimed * sizeof (dnode_t)); + } dnh = &dnc->dnc_children[idx]; if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { @@ -1517,6 +1651,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } else { dn = dnode_create(os, dn_block + idx, db, object, dnh); + dmu_buf_add_user_size(&db->db, sizeof (dnode_t)); } mutex_enter(&dn->dn_mtx); @@ -1567,7 +1702,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, * Return held dnode if the object is allocated, NULL if not. 
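dnode_reclaim_slots() now reports how many embedded dnode_t structures it destroyed so the caller can shrink the dnode block's "user size" accounting by the same amount, and the matching dmu_buf_add_user_size() call is made whenever a dnode_t is instantiated against the block. A condensed restatement of that bookkeeping, merging two separate call sites from the hunks above purely for illustration (kernel context, error handling omitted):

	/*
	 * Instantiating a dnode_t against this dnode block effectively
	 * grows the buffer, so charge the bytes to the dbuf's user size.
	 */
	dn = dnode_create(os, dn_block + idx, db, object, dnh);
	dmu_buf_add_user_size(&db->db, sizeof (dnode_t));

	/*
	 * When interior slots are reclaimed for a multi-slot allocation,
	 * credit back exactly as many dnode_t's as were destroyed.
	 */
	if (slots > 1) {
		uint_t reclaimed =
		    dnode_reclaim_slots(dnc, idx + 1, slots - 1);
		if (reclaimed > 0)
			dmu_buf_sub_user_size(&db->db,
			    reclaimed * sizeof (dnode_t));
	}

Keeping the user size in step with the number of live dnode_t's is what lets ARC accounting reflect the real memory footprint of a cached dnode block.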
*/ int -dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) +dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, dnp)); @@ -1579,7 +1714,7 @@ dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) * new reference. */ boolean_t -dnode_add_ref(dnode_t *dn, void *tag) +dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); if (zfs_refcount_is_zero(&dn->dn_holds)) { @@ -1592,14 +1727,14 @@ dnode_add_ref(dnode_t *dn, void *tag) } void -dnode_rele(dnode_t *dn, void *tag) +dnode_rele(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag, B_FALSE); } void -dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) +dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ @@ -1621,7 +1756,9 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) * other direct or indirect hold on the dnode must first drop the dnode * handle. */ +#ifdef ZFS_DEBUG ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); +#endif /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { @@ -1649,7 +1786,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode contains any uncommitted dirty records. + * Checks if the dnode itself is dirty, or is carrying any uncommitted records. + * It is important to check both conditions, as some operations (eg appending + * to a file) can dirty both as a single logical unit, but they are not synced + * out atomically, so checking one and not the other can result in an object + * appearing to be clean mid-way through a commit. + * + * Do not change this lightly! If you get it wrong, dmu_offset_next() can + * detect a hole where there is really data, leading to silent corruption. 
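The new comment warns that this check feeds dmu_offset_next(), which in turn backs lseek(SEEK_HOLE)/lseek(SEEK_DATA). A small userspace probe of that interface (file name hypothetical) shows what is at stake: a copy tool that trusts SEEK_DATA/SEEK_HOLE will silently skip any region the filesystem wrongly reports as a hole.

	#define	_GNU_SOURCE	/* SEEK_DATA/SEEK_HOLE on Linux */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("testfile", O_RDONLY);
		if (fd == -1) {
			perror("open");
			return (1);
		}
		off_t end = lseek(fd, 0, SEEK_END);
		/* first data at or after offset 0; -1/ENXIO if none */
		off_t data = lseek(fd, 0, SEEK_DATA);
		off_t hole = (data >= 0) ? lseek(fd, data, SEEK_HOLE) : -1;
		printf("size=%lld first_data=%lld next_hole=%lld\n",
		    (long long)end, (long long)data, (long long)hole);
		close(fd);
		return (0);
	}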
*/ boolean_t dnode_is_dirty(dnode_t *dn) @@ -1657,7 +1801,8 @@ dnode_is_dirty(dnode_t *dn) mutex_enter(&dn->dn_mtx); for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i])) { + if (multilist_link_active(&dn->dn_dirty_link[i]) || + !list_is_empty(&dn->dn_dirty_records[i])) { mutex_exit(&dn->dn_mtx); return (B_TRUE); } @@ -1767,7 +1912,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs == dn->dn_indblkshift) ibs = 0; - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + if (size == dn->dn_datablksz && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); @@ -1790,24 +1935,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) { - dbuf_new_size(db, size, tx); - } else if (err != ENOENT) { - goto fail; - } - - dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (size != dn->dn_datablksz) { + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); + if (err == 0) { + dbuf_new_size(db, size, tx); + } else if (err != ENOENT) { + goto fail; + } + + dnode_setdblksz(dn, size); + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; + if (db) + dbuf_rele(db, FTAG); + } if (ibs) { dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; } - /* release after we have fixed the blocksize in the dnode */ - if (db) - dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); @@ -2032,7 +2178,7 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, } void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag) +dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) { /* * Don't set dirtyctx to SYNC if we're just modifying this as we @@ -2082,7 +2228,7 @@ dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_buf_will_dirty(&db->db, tx); data = db->db.db_data; - bzero(data + blkoff, len); + memset(data + blkoff, 0, len); } dbuf_rele(db, FTAG); } @@ -2292,19 +2438,11 @@ dnode_spill_freed(dnode_t *dn) uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { - void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; if (blkid == DMU_BONUS_BLKID) return (FALSE); - /* - * If we're in the process of opening the pool, dp will not be - * set yet, but there shouldn't be anything dirty. 
- */ - if (dp == NULL) - return (FALSE); - if (dn->dn_free_txg) return (TRUE); @@ -2419,7 +2557,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || - db->db_blkptr->blk_birth <= txg || + BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree @@ -2467,7 +2605,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && - (hole || bp[i].blk_birth > txg)) + (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; @@ -2481,8 +2619,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, if (inc < 0) { /* traversing backwards; position offset at the end */ - ASSERT3U(*offset, <=, start); - *offset = MIN(*offset + (1ULL << span) - 1, start); + if (span < 8 * sizeof (*offset)) + *offset = MIN(*offset + (1ULL << span) - 1, + start); } else if (*offset < start) { *offset = start; } @@ -2589,3 +2728,8 @@ EXPORT_SYMBOL(dnode_free_range); EXPORT_SYMBOL(dnode_evict_dbufs); EXPORT_SYMBOL(dnode_evict_bonus); #endif + +ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW, + "Default dnode block shift"); +ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW, + "Default dnode indirect block shift"); diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c index dd37e3af7ed5..f67dad002319 100644 --- a/sys/contrib/openzfs/module/zfs/dnode_sync.c +++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
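The last dnode.c hunk above guards the backwards-traversal adjustment with span < 8 * sizeof (*offset): shifting a 64-bit value by 64 or more bits is undefined behaviour in C, so the old unconditional 1ULL << span was only safe for spans below the type width. A minimal standalone illustration of the guarded form (helper name invented):

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Clamp 'offset' to the end of the region covered by 'span' bits,
	 * but never shift by the full width of the type (undefined in C).
	 */
	static uint64_t
	clamp_to_span_end(uint64_t offset, uint64_t start, unsigned span)
	{
		if (span < 8 * sizeof (offset)) {
			uint64_t end = offset + (1ULL << span) - 1;
			offset = (end < start) ? end : start;
		}
		return (offset);
	}

	int
	main(void)
	{
		/* prints 7: normal case, span of 3 bits covers 8 offsets */
		printf("%llu\n",
		    (unsigned long long)clamp_to_span_end(0, 100, 3));
		/* prints 0: span == 64, adjustment skipped instead of UB */
		printf("%llu\n",
		    (unsigned long long)clamp_to_span_end(0, 100, 64));
		return (0);
	}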
* @@ -70,8 +70,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); for (i = 0; i < nblkptr; i++) { - children[i] = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + children[i] = dbuf_find(dn->dn_objset, dn->dn_object, + old_toplvl, i, NULL); } /* transfer dnode's block pointers to new indirect block */ @@ -82,7 +82,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + memcpy(db->db.db_data, dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); @@ -119,7 +119,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&child->db_mtx); } - bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + memset(dn->dn_phys->dn_blkptr, 0, sizeof (blkptr_t) * nblkptr); rw_exit(&db->db_rwlock); if (dn->dn_dbuf != NULL) @@ -158,7 +158,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) dmu_object_type_t type = BP_GET_TYPE(bp); uint64_t lvl = BP_GET_LEVEL(bp); - bzero(bp, sizeof (blkptr_t)); + memset(bp, 0, sizeof (blkptr_t)); if (spa_feature_is_active(dn->dn_objset->os_spa, SPA_FEATURE_HOLE_BIRTH)) { @@ -175,19 +175,21 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { - int off, num; - int i, err, epbs; + uint64_t off, num, i, j; + unsigned int epbs; + int err; uint64_t txg = tx->tx_txg; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; - off = start - (db->db_blkid * 1<<epbs); + off = start - (db->db_blkid << epbs); num = end - start + 1; - ASSERT3U(off, >=, 0); - ASSERT3U(num, >=, 0); + ASSERT3U(dn->dn_phys->dn_indblkshift, >=, SPA_BLKPTRSHIFT); + ASSERT3U(end + 1, >=, start); + ASSERT3U(start, >=, (db->db_blkid << epbs)); ASSERT3U(db->db_level, >, 0); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); @@ -197,7 +199,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) uint64_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; - int j; ASSERT(db->db_level == 1); @@ -217,8 +218,11 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); + "child=%p i=%llu off=%llu " + "num=%llu\n", + (void *)child, (u_longlong_t)i, + (u_longlong_t)off, + (u_longlong_t)num); } } } @@ -234,8 +238,11 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); + "child=%p i=%llu off=%llu " + "num=%llu\n", + (void *)child, (u_longlong_t)i, + (u_longlong_t)off, + (u_longlong_t)num); } } } @@ -347,7 +354,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, rw_enter(&db->db_rwlock, RW_WRITER); for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); - bzero(db->db.db_data, db->db.db_size); + memset(db->db.db_data, 0, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); rw_exit(&db->db_rwlock); } @@ -475,7 +482,14 @@ dnode_evict_dbufs(dnode_t 
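free_verify() now prints its uint64_t values with %llu plus an explicit (u_longlong_t) cast. uint64_t is "long" on some 64-bit ABIs and "long long" on others, so the cast is what makes a single format string portable across them. A tiny standalone version of the idiom (u_longlong_t is the illumos/ZFS typedef for unsigned long long):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t off = 12345, num = 67890;

		/* kernel style: cast so %llu always matches the argument */
		printf("off=%llu num=%llu\n",
		    (unsigned long long)off, (unsigned long long)num);

		/* userspace alternative: PRIu64 from <inttypes.h> */
		printf("off=%" PRIu64 " num=%" PRIu64 "\n", off, num);
		return (0);
	}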
*dn) zfs_refcount_is_zero(&db->db_holds)) { db_marker->db_level = db->db_level; db_marker->db_blkid = db->db_blkid; - db_marker->db_state = DB_SEARCH; + /* + * Insert a MARKER node with the same level and blkid. + * And to resolve any ties in dbuf_compare() use the + * pointer of the dbuf that we are evicting. Pass the + * address in db_parent. + */ + db_marker->db_state = DB_MARKER; + db_marker->db_parent = (void *)((uintptr_t)db - 1); avl_insert_here(&dn->dn_dbufs, db_marker, db, AVL_BEFORE); @@ -597,7 +611,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); - bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); + memset(dn->dn_phys, 0, sizeof (dnode_phys_t) * dn->dn_num_slots); dnode_free_interior_slots(dn); mutex_enter(&dn->dn_mtx); @@ -620,6 +634,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. + * Does not wait for zio completions. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) @@ -634,7 +649,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ASSERT(dnp->dn_type != DMU_OT_NONE || - bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); + memcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); @@ -655,8 +670,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) DNODE_FLAG_USEROBJUSED_ACCOUNTED; mutex_exit(&dn->dn_mtx); dmu_objset_userquota_get_ids(dn, B_FALSE, tx); - } else { - /* Once we account for it, we should always account for it */ + } else if (!(os->os_encrypted && dmu_objset_is_receiving(os))) { + /* + * Once we account for it, we should always account for it, + * except for the case of a raw receive. We will not be able + * to account for it until the receiving dataset has been + * mounted. + */ ASSERT(!(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED)); ASSERT(!(dn->dn_phys->dn_flags & @@ -822,7 +842,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_allocated_txg == tx->tx_txg); if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + memset(dnp->dn_blkptr + dnp->dn_nblkptr, 0, sizeof (blkptr_t) * (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); #ifdef ZFS_DEBUG @@ -849,6 +869,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } + ASSERT3U(dnp->dn_bonuslen, <=, DN_MAX_BONUS_LEN(dnp)); + /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c index bead7da2237f..5fd8bc2a2682 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c +++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c @@ -34,10 +34,11 @@ #include <sys/dsl_bookmark.h> #include <zfs_namecheck.h> #include <sys/dmu_send.h> +#include <sys/dbuf.h> static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, - dsl_dataset_t **dsp, void *tag, char **shortnamep) + dsl_dataset_t **dsp, const void *tag, char **shortnamep) { char buf[ZFS_MAX_DATASET_NAME_LEN]; char *hashp; @@ -82,7 +83,7 @@ dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname, * Zero out the bookmark in case the one stored on disk * is in an older, shorter format. 
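dnode_evict_dbufs() keeps its place in the dn_dbufs AVL tree with a stack-allocated marker dbuf; the hunk above switches the marker to a dedicated DB_MARKER state and uses db_parent to break comparison ties against the dbuf being evicted. A generic, hedged sketch of the marker-walk pattern it relies on (node_t, needs_work() and do_expensive_work() are placeholders; the real code drops dn_dbufs_mtx while evicting):

	node_t marker;	/* local; never visible as a real tree node */

	mutex_enter(&tree_lock);
	for (node_t *n = avl_first(&tree); n != NULL; ) {
		if (needs_work(n)) {
			/* remember our position, then drop the lock */
			avl_insert_here(&tree, &marker, n, AVL_BEFORE);
			mutex_exit(&tree_lock);

			do_expensive_work(n);	/* may remove or free n */

			mutex_enter(&tree_lock);
			n = AVL_NEXT(&tree, &marker);
			avl_remove(&tree, &marker);
		} else {
			n = AVL_NEXT(&tree, n);
		}
	}
	mutex_exit(&tree_lock);

Because the marker sits in the tree while the lock is dropped, the walk can resume correctly even if the node it was parked next to is removed in the meantime.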
*/ - bzero(bmark_phys, sizeof (*bmark_phys)); + memset(bmark_phys, 0, sizeof (*bmark_phys)); err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, @@ -160,15 +161,14 @@ dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source) int dsl_bookmark_create_nvl_validate(nvlist_t *bmarks) { - char *first; - size_t first_len; + const char *first = NULL; + size_t first_len = 0; - first = NULL; for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { - char *bmark = nvpair_name(pair); - char *source; + const char *bmark = nvpair_name(pair); + const char *source; /* list structure: values must be snapshots XOR bookmarks */ if (nvpair_value_string(pair, &source) != 0) @@ -178,7 +178,7 @@ dsl_bookmark_create_nvl_validate(nvlist_t *bmarks) /* same pool check */ if (first == NULL) { - char *cp = strpbrk(bmark, "/#"); + const char *cp = strpbrk(bmark, "/#"); if (cp == NULL) return (-1); first = bmark; @@ -230,7 +230,6 @@ dsl_bookmark_create_check_impl(dsl_pool_t *dp, switch (error) { case ESRCH: /* happy path: new bmark doesn't exist, proceed after switch */ - error = 0; break; case 0: error = SET_ERROR(EEXIST); @@ -307,11 +306,11 @@ dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - char *new = nvpair_name(pair); + const char *new = nvpair_name(pair); int error = schema_err; if (error == 0) { - char *source = fnvpair_value_string(pair); + const char *source = fnvpair_value_string(pair); error = dsl_bookmark_create_check_impl(dp, new, source); if (error != 0) error = SET_ERROR(error); @@ -347,6 +346,8 @@ dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) spa_t *spa = dsl_dataset_get_spa(snap); objset_t *mos = spa_get_dsl(spa)->dp_meta_objset; dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap); + + memset(zbm, 0, sizeof (zfs_bookmark_phys_t)); zbm->zbm_guid = dsp->ds_guid; zbm->zbm_creation_txg = dsp->ds_creation_txg; zbm->zbm_creation_time = dsp->ds_creation_time; @@ -380,10 +381,6 @@ dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) &zbm->zbm_compressed_freed_before_next_snap, &zbm->zbm_uncompressed_freed_before_next_snap); dsl_dataset_rele(nextds, FTAG); - } else { - bzero(&zbm->zbm_flags, - sizeof (zfs_bookmark_phys_t) - - offsetof(zfs_bookmark_phys_t, zbm_flags)); } } @@ -426,8 +423,8 @@ dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx); } - __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 }; - ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, + zfs_bookmark_phys_t zero_phys = { 0 }; + ASSERT0(memcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size)); VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name, @@ -441,8 +438,8 @@ dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, */ static void dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, - dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag, - redaction_list_t **redaction_list) + dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, + const void *tag, redaction_list_t **redaction_list) { dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; @@ -463,26 +460,43 @@ 
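Several of these hunks only change char * to const char * for names and string values pulled out of nvlists, matching the const-ified libnvpair interfaces. The iteration idiom itself, in a self-contained userspace form (the bookmark and snapshot names are made up; link against libnvpair):

	#include <libnvpair.h>
	#include <stdio.h>

	int
	main(void)
	{
		nvlist_t *bmarks = fnvlist_alloc();

		fnvlist_add_string(bmarks, "pool/fs#mark1", "pool/fs@snap1");
		fnvlist_add_string(bmarks, "pool/fs#mark2", "pool/fs@snap2");

		for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
			const char *bmark = nvpair_name(pair);
			const char *source = fnvpair_value_string(pair);
			printf("%s -> %s\n", bmark, source);
		}

		fnvlist_free(bmarks);
		return (0);
	}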
dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); if (redaction_list != NULL || bookmark_redacted) { redaction_list_t *local_rl; + boolean_t spill = B_FALSE; if (bookmark_redacted) { redact_snaps = dsredactsnaps; num_redact_snaps = dsnumsnaps; } + int bonuslen = sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t); + if (bonuslen > dmu_bonus_max()) + spill = B_TRUE; dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + - num_redact_snaps * sizeof (uint64_t), tx); + DMU_OTN_UINT64_METADATA, spill ? 0 : bonuslen, tx); spa_feature_incr(dp->dp_spa, SPA_FEATURE_REDACTION_BOOKMARKS, tx); + if (spill) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } VERIFY0(dsl_redaction_list_hold_obj(dp, dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); dsl_redaction_list_long_hold(dp, local_rl, tag); - ASSERT3U((local_rl)->rl_dbuf->db_size, >=, - sizeof (redaction_list_phys_t) + num_redact_snaps * - sizeof (uint64_t)); - dmu_buf_will_dirty(local_rl->rl_dbuf, tx); - bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps, + if (!spill) { + ASSERT3U(local_rl->rl_bonus->db_size, >=, bonuslen); + dmu_buf_will_dirty(local_rl->rl_bonus, tx); + } else { + dmu_buf_t *db; + VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus, + DB_RF_MUST_SUCCEED, FTAG, &db)); + dmu_buf_will_fill(db, tx, B_FALSE); + VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen, + SPA_MINBLOCKSIZE), tx)); + local_rl->rl_phys = db->db_data; + local_rl->rl_dbuf = db; + } + memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps, sizeof (uint64_t) * num_redact_snaps); local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; if (bookmark_redacted) { @@ -593,8 +607,8 @@ dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { - char *new = nvpair_name(pair); - char *source = fnvpair_value_string(pair); + const char *new = nvpair_name(pair); + const char *source = fnvpair_value_string(pair); if (strchr(source, '@') != NULL) { dsl_bookmark_create_sync_impl_snap(new, source, tx, @@ -640,11 +654,15 @@ dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) SPA_FEATURE_REDACTION_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); /* - * If the list of redact snaps will not fit in the bonus buffer with - * the furthest reached object and offset, fail. + * If the list of redact snaps will not fit in the bonus buffer (or + * spill block, with the REDACTION_LIST_SPILL feature) with the + * furthest reached object and offset, fail. */ - if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - - sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + uint64_t snaplimit = ((spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL) ? 
spa_maxblocksize(dp->dp_spa) : + dmu_bonus_max()) - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t); + if (dbcra->dbcra_numsnaps > snaplimit) return (SET_ERROR(E2BIG)); if (dsl_bookmark_create_nvl_validate_pair( @@ -667,7 +685,8 @@ dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx) int dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot, - uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl) + uint64_t numsnaps, uint64_t *snapguids, const void *tag, + redaction_list_t **rl) { dsl_bookmark_create_redacted_arg_t dbcra; @@ -1043,6 +1062,14 @@ dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, } if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), @@ -1191,19 +1218,19 @@ dsl_redaction_list_long_held(redaction_list_t *rl) } void -dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag) +dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, + const void *tag) { ASSERT(dsl_pool_config_held(dp)); (void) zfs_refcount_add(&rl->rl_longholds, tag); } void -dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag) +dsl_redaction_list_long_rele(redaction_list_t *rl, const void *tag) { (void) zfs_refcount_remove(&rl->rl_longholds, tag); } -/* ARGSUSED */ static void redaction_list_evict_sync(void *rlu) { @@ -1214,17 +1241,19 @@ redaction_list_evict_sync(void *rlu) } void -dsl_redaction_list_rele(redaction_list_t *rl, void *tag) +dsl_redaction_list_rele(redaction_list_t *rl, const void *tag) { - dmu_buf_rele(rl->rl_dbuf, tag); + if (rl->rl_bonus != rl->rl_dbuf) + dmu_buf_rele(rl->rl_dbuf, tag); + dmu_buf_rele(rl->rl_bonus, tag); } int -dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, +dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag, redaction_list_t **rlp) { objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; + dmu_buf_t *dbuf, *spill_dbuf; redaction_list_t *rl; int err; @@ -1239,13 +1268,18 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, redaction_list_t *winner = NULL; rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); - rl->rl_dbuf = dbuf; + rl->rl_bonus = dbuf; + if (dmu_spill_hold_existing(dbuf, tag, &spill_dbuf) == 0) { + rl->rl_dbuf = spill_dbuf; + } else { + rl->rl_dbuf = dbuf; + } rl->rl_object = rlobj; - rl->rl_phys = dbuf->db_data; + rl->rl_phys = rl->rl_dbuf->db_data; rl->rl_mos = dp->dp_meta_objset; zfs_refcount_create(&rl->rl_longholds); dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, - &rl->rl_dbuf); + &rl->rl_bonus); if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { kmem_free(rl, sizeof (*rl)); rl = winner; @@ -1295,7 +1329,7 @@ dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) * The empty-string name can't be in the AVL, and it compares * before any entries with this TXG. 
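The redaction-list changes let dsl_bookmark spill the snapshot-guid array into a spill block (bumping the new SPA_FEATURE_REDACTION_LIST_SPILL feature) once it no longer fits in the dnode's bonus buffer. A condensed, annotated restatement of the creation-side decision taken from the hunks above (kernel context; error handling and the surrounding feature bookkeeping are omitted):

	int bonuslen = sizeof (redaction_list_phys_t) +
	    num_redact_snaps * sizeof (uint64_t);
	boolean_t spill = (bonuslen > dmu_bonus_max());

	/* A spilled list is created with an empty (length 0) bonus buffer. */
	uint64_t obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
	    spill ? 0 : bonuslen, tx);

	if (spill) {
		dmu_buf_t *db;
		/* attach a spill block rounded up to a 512-byte multiple */
		VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
		    DB_RF_MUST_SUCCEED, FTAG, &db));
		dmu_buf_will_fill(db, tx, B_FALSE);
		VERIFY0(dbuf_spill_set_blksz(db,
		    P2ROUNDUP(bonuslen, SPA_MINBLOCKSIZE), tx));
	}

The corresponding hold path, also shown above, tries dmu_spill_hold_existing() first and falls back to reading the list out of the bonus buffer when no spill block exists.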
*/ - search.dbn_name = ""; + search.dbn_name = (char *)""; VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); dsl_bookmark_node_t *dbn = avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); @@ -1422,7 +1456,7 @@ dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, * The empty-string name can't be in the AVL, and it compares * before any entries with this TXG. */ - search.dbn_name = ""; + search.dbn_name = (char *)""; VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); dsl_bookmark_node_t *dbn = avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); @@ -1470,10 +1504,11 @@ dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, * Adjust the FBN of any bookmarks that reference this block, whose "next" * is the head dataset. */ -/* ARGSUSED */ void dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { + (void) tx; + /* * Iterate over bookmarks whose "next" is the head dataset. */ @@ -1485,7 +1520,8 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * If the block was live (referenced) at the time of this * bookmark, add its space to the bookmark's FBN. */ - if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= + dbn->dbn_phys.zbm_creation_txg && (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { mutex_enter(&dbn->dbn_lock); dbn->dbn_phys.zbm_referenced_freed_before_next_snap += diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c index 26d4c2fe7e33..8e1055d9bcb1 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c +++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c @@ -80,13 +80,13 @@ int zfs_disable_ivset_guid_check = 0; static void -dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag) +dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, const void *tag) { (void) zfs_refcount_add(&wkey->wk_refcnt, tag); } static void -dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag) +dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, const void *tag) { (void) zfs_refcount_remove(&wkey->wk_refcnt, tag); } @@ -97,7 +97,7 @@ dsl_wrapping_key_free(dsl_wrapping_key_t *wkey) ASSERT0(zfs_refcount_count(&wkey->wk_refcnt)); if (wkey->wk_key.ck_data) { - bzero(wkey->wk_key.ck_data, + memset(wkey->wk_key.ck_data, 0, CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); kmem_free(wkey->wk_key.ck_data, CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); @@ -119,9 +119,8 @@ dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat, /* allocate and initialize the underlying crypto key */ wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP); - wkey->wk_key.ck_format = CRYPTO_KEY_RAW; wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN); - bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN); + memcpy(wkey->wk_key.ck_data, wkeydata, WRAPPING_KEY_LEN); /* initialize the rest of the struct */ zfs_refcount_create(&wkey->wk_refcnt); @@ -144,7 +143,7 @@ dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, dsl_wrapping_key_t *wkey = NULL; uint8_t *wkeydata = NULL; uint_t wkeydata_len = 0; - char *keylocation = NULL; + const char *keylocation = NULL; dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP); dcp->cp_cmd = cmd; @@ -267,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b) return (0); } +/* + * this compares a crypto key based on zk_guid. See comment on + * spa_crypto_key_compare for more information. 
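dsl_wrapping_key_free() now scrubs the raw key bytes with memset() before handing the buffer back to the allocator, so stale wrapping-key material does not linger in freed kernel memory. The idiom in isolation (kernel context, same fields and helpers as the diff):

	/* Scrub key material before freeing; ck_length is stored in bits. */
	if (wkey->wk_key.ck_data != NULL) {
		memset(wkey->wk_key.ck_data, 0,
		    CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
		kmem_free(wkey->wk_key.ck_data,
		    CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
	}

Userspace code doing the same thing would normally reach for explicit_bzero() or memset_s(), since an ordinary memset immediately before free() is a candidate for dead-store elimination; in-kernel, this plain memset matches the surrounding style.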
+ */ +boolean_t +dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb) +{ + dsl_crypto_key_t *dcka = NULL; + dsl_crypto_key_t *dckb = NULL; + uint64_t obja, objb; + boolean_t equal; + spa_t *spa; + + spa = dmu_objset_spa(osa); + if (spa != dmu_objset_spa(osb)) + return (B_FALSE); + obja = dmu_objset_ds(osa)->ds_object; + objb = dmu_objset_ds(osb)->ds_object; + + if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0) + return (B_FALSE); + if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) { + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + return (B_FALSE); + } + + equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid); + + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + spa_keystore_dsl_key_rele(spa, dckb, FTAG); + + return (equal); +} + static int spa_key_mapping_compare(const void *a, const void *b) { @@ -369,7 +402,7 @@ dsl_dir_incompatible_encryption_version(dsl_dir_t *dd) static int spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj, - void *tag, dsl_wrapping_key_t **wkey_out) + const void *tag, dsl_wrapping_key_t **wkey_out) { int ret; dsl_wrapping_key_t search_wkey; @@ -399,7 +432,7 @@ error: } static int -spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, +spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, const void *tag, dsl_wrapping_key_t **wkey_out) { int ret; @@ -515,7 +548,7 @@ dsl_crypto_key_free(dsl_crypto_key_t *dck) } static void -dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag) +dsl_crypto_key_rele(dsl_crypto_key_t *dck, const void *tag) { if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) dsl_crypto_key_free(dck); @@ -523,7 +556,7 @@ dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag) static int dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, - uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out) + uint64_t dckobj, const void *tag, dsl_crypto_key_t **dck_out) { int ret; uint64_t crypt = 0, guid = 0, version = 0; @@ -542,6 +575,12 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, if (ret != 0) goto error; + /* handle a future crypto suite that we don't support */ + if (crypt >= ZIO_CRYPT_FUNCTIONS) { + ret = (SET_ERROR(ZFS_ERR_CRYPTO_NOTSUP)); + goto error; + } + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid); if (ret != 0) goto error; @@ -592,7 +631,7 @@ dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, error: if (dck != NULL) { - bzero(dck, sizeof (dsl_crypto_key_t)); + memset(dck, 0, sizeof (dsl_crypto_key_t)); kmem_free(dck, sizeof (dsl_crypto_key_t)); } @@ -601,7 +640,7 @@ error: } static int -spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag, +spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, const void *tag, dsl_crypto_key_t **dck_out) { int ret; @@ -632,7 +671,7 @@ error: } static int -spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, +spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, const void *tag, dsl_crypto_key_t **dck_out) { int ret; @@ -690,7 +729,7 @@ spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, } void -spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag) +spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, const void *tag) { rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); @@ -937,7 +976,7 @@ error: } void -key_mapping_add_ref(dsl_key_mapping_t *km, void *tag) +key_mapping_add_ref(dsl_key_mapping_t *km, const void *tag) { ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); zfs_refcount_add(&km->km_refcnt, tag); @@ -954,7 +993,7 @@ 
key_mapping_add_ref(dsl_key_mapping_t *km, void *tag) * mapping after unmounting a dataset. */ void -key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag) +key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, const void *tag) { ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); @@ -985,7 +1024,7 @@ key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag) } int -spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag, +spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, const void *tag, dsl_key_mapping_t **km_out) { int ret; @@ -1044,7 +1083,7 @@ spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag, } int -spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag) +spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, const void *tag) { int ret; dsl_key_mapping_t search_km; @@ -1082,7 +1121,7 @@ error_unlock: * without getting a reference to it. */ int -spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag, +spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, const void *tag, dsl_crypto_key_t **dck_out) { int ret; @@ -1138,7 +1177,7 @@ dmu_objset_check_wkey_loaded(dsl_dir_t *dd) return (0); } -static zfs_keystatus_t +zfs_keystatus_t dsl_dataset_get_keystatus(dsl_dir_t *dd) { /* check if this dd has a has a dsl key */ @@ -1507,7 +1546,7 @@ spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx) dsl_crypto_params_t *dcp = skcka->skcka_cp; dsl_wrapping_key_t *wkey = NULL, *found_wkey; dsl_wrapping_key_t wkey_search; - char *keylocation = dcp->cp_keylocation; + const char *keylocation = dcp->cp_keylocation; uint64_t rddobj, new_rddobj; /* create and initialize the wrapping key */ @@ -2007,14 +2046,6 @@ dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds, if (ret != 0) return (ret); - /* - * Useraccounting is not portable and must be done with the keys loaded. - * Therefore, whenever we do any kind of receive the useraccounting - * must not be present. - */ - ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); - ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE); - mdn = DMU_META_DNODE(os); /* @@ -2104,8 +2135,9 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, * written out raw next time. */ arc_release(os->os_phys_buf, &os->os_phys_buf); - bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN); - bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN); + memcpy(os->os_phys->os_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + memset(os->os_phys->os_local_mac, 0, ZIO_OBJSET_MAC_LEN); + os->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; /* set metadnode compression and checksum */ @@ -2127,9 +2159,6 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dsl_dataset_sync(ds, zio, tx); VERIFY0(zio_wait(zio)); - - /* dsl_dataset_sync_done will drop this reference. */ - dmu_buf_add_ref(ds->ds_dbuf, ds); dsl_dataset_sync_done(ds, tx); } } @@ -2152,10 +2181,16 @@ dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) * wrapping key. */ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval); - if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS || - intval <= ZIO_CRYPT_OFF) + if (ret != 0 || intval <= ZIO_CRYPT_OFF) return (SET_ERROR(EINVAL)); + /* + * Flag a future crypto suite that we don't support differently, so + * we can return a more useful error to the user. 
+ */ + if (intval >= ZIO_CRYPT_FUNCTIONS) + return (SET_ERROR(ZFS_ERR_CRYPTO_NOTSUP)); + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval); if (ret != 0) return (SET_ERROR(EINVAL)); @@ -2237,7 +2272,7 @@ dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) uint8_t *keydata, *hmac_keydata, *iv, *mac; uint64_t crypt, key_guid, keyformat, iters, salt; uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION; - char *keylocation = "prompt"; + const char *keylocation = "prompt"; /* lookup the values we need to create the DSL Crypto Key */ crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE); @@ -2555,7 +2590,7 @@ dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx)); zio_crypt_key_destroy(&dck.dck_key); - bzero(&dck.dck_key, sizeof (zio_crypt_key_t)); + memset(&dck.dck_key, 0, sizeof (zio_crypt_key_t)); return (dck.dck_obj); } @@ -2679,6 +2714,7 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, objset_phys_t *osp = buf; uint8_t portable_mac[ZIO_OBJSET_MAC_LEN]; uint8_t local_mac[ZIO_OBJSET_MAC_LEN]; + const uint8_t zeroed_mac[ZIO_OBJSET_MAC_LEN] = {0}; /* look up the key from the spa's keystore */ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); @@ -2695,16 +2731,30 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, /* if we are generating encode the HMACs in the objset_phys_t */ if (generate) { - bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN); - bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN); + memcpy(osp->os_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + memcpy(osp->os_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); abd_return_buf_copy(abd, buf, datalen); return (0); } - if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 || - bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) { - abd_return_buf(abd, buf, datalen); - return (SET_ERROR(ECKSUM)); + if (memcmp(portable_mac, osp->os_portable_mac, + ZIO_OBJSET_MAC_LEN) != 0 || + memcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) { + /* + * If the MAC is zeroed out, we failed to decrypt it. + * This should only arise, at least on Linux, + * if we hit edge case handling for useraccounting, since we + * shouldn't get here without bailing out on error earlier + * otherwise. + * + * So if we're in that case, we can just fall through and + * special-casing noticing that it's zero will handle it + * elsewhere, since we can just regenerate it. + */ + if (memcmp(local_mac, zeroed_mac, ZIO_OBJSET_MAC_LEN) != 0) { + abd_return_buf(abd, buf, datalen); + return (SET_ERROR(ECKSUM)); + } } abd_return_buf(abd, buf, datalen); @@ -2746,11 +2796,11 @@ spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd, * Otherwise verify that the MAC matched what we expected. 
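The MAC-verification hunk above adds a special case: if the stored local MAC is all zeroes, the mismatch is tolerated, because (per the new comment) a zeroed local MAC is the edge case left behind by user-accounting handling and can simply be regenerated later. The "is this buffer all zero" test is done by comparing against a stack scratch buffer with memcmp; standalone, with MAC_LEN standing in for ZIO_OBJSET_MAC_LEN:

	#include <stdio.h>
	#include <string.h>

	#define	MAC_LEN	32	/* stands in for ZIO_OBJSET_MAC_LEN */

	static int
	mac_is_zeroed(const unsigned char mac[MAC_LEN])
	{
		const unsigned char zeroed_mac[MAC_LEN] = {0};

		return (memcmp(mac, zeroed_mac, MAC_LEN) == 0);
	}

	int
	main(void)
	{
		unsigned char a[MAC_LEN] = {0};
		unsigned char b[MAC_LEN] = {0};

		b[5] = 0xab;
		printf("a zeroed: %d, b zeroed: %d\n",
		    mac_is_zeroed(a), mac_is_zeroed(b));
		return (0);
	}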
*/ if (generate) { - bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN); + memcpy(mac, digestbuf, ZIO_DATA_MAC_LEN); return (0); } - if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0) + if (memcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0) return (SET_ERROR(ECKSUM)); return (0); @@ -2849,9 +2899,9 @@ spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb, error: if (encrypt) { /* zero out any state we might have changed while encrypting */ - bzero(salt, ZIO_DATA_SALT_LEN); - bzero(iv, ZIO_DATA_IV_LEN); - bzero(mac, ZIO_DATA_MAC_LEN); + memset(salt, 0, ZIO_DATA_SALT_LEN); + memset(iv, 0, ZIO_DATA_IV_LEN); + memset(mac, 0, ZIO_DATA_MAC_LEN); abd_return_buf(pabd, plainbuf, datalen); abd_return_buf_copy(cabd, cipherbuf, datalen); } else { diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index f99964511aa6..b4de0e7ff073 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -73,13 +73,22 @@ * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. + * we did not allow the recordsize to be set larger than zfs_max_recordsize + * (former default: 1MB). Larger blocks could be created by changing this + * tunable, and pools with larger blocks could always be imported and used, + * regardless of this setting. + * + * We do, however, still limit it by default to 1M on x86_32, because Linux's + * 3/1 memory split doesn't leave much room for 16M chunks. 
*/ -int zfs_max_recordsize = 1 * 1024 * 1024; -int zfs_allow_redacted_dataset_mount = 0; +#ifdef _ILP32 +uint_t zfs_max_recordsize = 1 * 1024 * 1024; +#else +uint_t zfs_max_recordsize = 16 * 1024 * 1024; +#endif +static int zfs_allow_redacted_dataset_mount = 0; + +int zfs_snapshot_history_enabled = 1; #define SWITCH64(x, y) \ { \ @@ -90,8 +99,6 @@ int zfs_allow_redacted_dataset_mount = 0; #define DS_REF_MAX (1ULL << 62) -extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); - static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx); static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, @@ -99,7 +106,7 @@ static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f); -extern int spa_asize_inflation; +extern uint_t spa_asize_inflation; static zil_header_t zero_zil; @@ -149,7 +156,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) return; } - ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); @@ -183,7 +191,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * they do not need to be freed. */ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -229,7 +237,7 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, mutex_exit(&ds->ds_remap_deadlist_lock); BP_ZERO(&fakebp); - fakebp.blk_birth = birth; + BP_SET_LOGICAL_BIRTH(&fakebp, birth); DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); @@ -252,7 +260,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(bp->blk_birth <= tx->tx_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); @@ -270,7 +278,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, * they do not need to be freed. 
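From here on, direct reads and writes of bp->blk_birth are replaced by the BP_GET_LOGICAL_BIRTH()/BP_SET_LOGICAL_BIRTH() accessors, which hide how the logical birth TXG is packed into the block pointer. A hedged usage-level sketch (the two helper functions are invented for illustration; only the accessor calls come from the diff):

	/*
	 * Was the block written after the given snapshot TXG?  Callers no
	 * longer read a raw blk_birth field; the accessor owns the layout.
	 */
	static boolean_t
	demo_block_born_after(const blkptr_t *bp, uint64_t snap_txg)
	{
		return (BP_GET_LOGICAL_BIRTH(bp) > snap_txg);
	}

	/* Synthesizing a fake bp for remap accounting uses the setter. */
	static void
	demo_fake_bp(blkptr_t *bp, uint64_t birth_txg)
	{
		BP_ZERO(bp);
		BP_SET_LOGICAL_BIRTH(bp, birth_txg);
	}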
*/ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -278,7 +286,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, bplist_append(&ds->ds_dir->dd_pending_frees, bp); } - if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); @@ -310,16 +318,16 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); - /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + /* if (logical birth > prev prev snap txg) prev unique += bs */ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > + ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -524,7 +532,7 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, matchtype_t mt = 0; int err; - dsl_dir_snap_cmtime_update(ds->ds_dir); + dsl_dir_snap_cmtime_update(ds->ds_dir, tx); if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; @@ -541,7 +549,7 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, } boolean_t -dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) +dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, const void *tag) { dmu_buf_t *dbuf = ds->ds_dbuf; boolean_t result = B_FALSE; @@ -559,7 +567,7 @@ dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) } int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; @@ -633,6 +641,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } + if (err != 0) + goto after_dsl_bookmark_fini; err = dsl_bookmark_init_ds(ds); } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) @@ -681,11 +691,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); if (err != 0 || winner != NULL) { - bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); dsl_bookmark_fini_ds(ds); +after_dsl_bookmark_fini: if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); @@ -696,6 +706,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, list_destroy(&ds->ds_prop_cbs); list_destroy(&ds->ds_sendstreams); + bplist_destroy(&ds->ds_pending_deadlist); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); @@ -748,7 +759,7 @@ 
dsl_dataset_create_key_mapping(dsl_dataset_t *ds) int dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, - ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) + ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { int err; @@ -769,7 +780,7 @@ dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, int dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; const char *snapname; @@ -822,7 +833,7 @@ dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, } int -dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, +dsl_dataset_hold(dsl_pool_t *dp, const char *name, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); @@ -830,7 +841,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, static int dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, - void *tag, boolean_t override, dsl_dataset_t **dsp) + const void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); if (err != 0) @@ -846,21 +857,21 @@ dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, int dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp)); } int dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj, - ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) + ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp)); } static int dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, boolean_t override, dsl_dataset_t **dsp) + const void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); if (err != 0) @@ -874,14 +885,14 @@ dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, int dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp)); } int dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, - void *tag, dsl_dataset_t **dsp) + const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp)); } @@ -896,14 +907,14 @@ dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, * and accessed. 
*/ void -dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) +dsl_dataset_long_hold(dsl_dataset_t *ds, const void *tag) { ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); (void) zfs_refcount_add(&ds->ds_longholds, tag); } void -dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) +dsl_dataset_long_rele(dsl_dataset_t *ds, const void *tag) { (void) zfs_refcount_remove(&ds->ds_longholds, tag); } @@ -960,7 +971,7 @@ dsl_dataset_namelen(dsl_dataset_t *ds) } void -dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +dsl_dataset_rele(dsl_dataset_t *ds, const void *tag) { dmu_buf_rele(ds->ds_dbuf, tag); } @@ -978,7 +989,8 @@ dsl_dataset_remove_key_mapping(dsl_dataset_t *ds) } void -dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) +dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, + const void *tag) { if (flags & DS_HOLD_FLAG_DECRYPT) dsl_dataset_remove_key_mapping(ds); @@ -987,7 +999,7 @@ dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) } void -dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) +dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, const void *tag) { ASSERT3P(ds->ds_owner, ==, tag); ASSERT(ds->ds_dbuf != NULL); @@ -1000,7 +1012,7 @@ dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override) +dsl_dataset_tryown(dsl_dataset_t *ds, const void *tag, boolean_t override) { boolean_t gotit = FALSE; @@ -1150,7 +1162,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); + memset(dsphys, 0, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); @@ -1250,20 +1262,17 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); - if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { + if (memcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { dsl_pool_t *dp = ds->ds_dir->dd_pool; zio_t *zio; - bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + memset(&os->os_zil_header, 0, sizeof (os->os_zil_header)); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dsl_dataset_sync(ds, zio, tx); VERIFY0(zio_wait(zio)); - - /* dsl_dataset_sync_done will drop this reference. 
*/ - dmu_buf_add_ref(ds->ds_dbuf, ds); dsl_dataset_sync_done(ds, tx); } } @@ -1612,7 +1621,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) for (pair = nvlist_next_nvpair(cnt_track, NULL); pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { int error = 0; - char *name; + const char *name; uint64_t cnt = 0; dsl_dataset_t *ds; @@ -1644,7 +1653,7 @@ dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { int error = 0; dsl_dataset_t *ds; - char *name, *atp = NULL; + const char *name, *atp = NULL; char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); @@ -1687,7 +1696,6 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; - static zil_header_t zero_zil __maybe_unused; objset_t *os __maybe_unused; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); @@ -1698,7 +1706,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, */ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || dmu_objset_from_ds(ds, &os) != 0 || - bcmp(&os->os_phys->os_zil_header, &zero_zil, + memcmp(&os->os_phys->os_zil_header, &zero_zil, sizeof (zero_zil)) == 0); /* Should not snapshot a dirty dataset. */ @@ -1720,7 +1728,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; - bzero(dsphys, sizeof (dsl_dataset_phys_t)); + memset(dsphys, 0, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, @@ -1854,9 +1862,10 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dsl_scan_ds_snapshotted(ds, tx); - dsl_dir_snap_cmtime_update(ds->ds_dir); + dsl_dir_snap_cmtime_update(ds->ds_dir, tx); - spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); + if (zfs_snapshot_history_enabled) + spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); } void @@ -1869,7 +1878,7 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { dsl_dataset_t *ds; - char *name, *atp; + const char *name, *atp; char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); @@ -1898,7 +1907,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) boolean_t needsuspend; int error; spa_t *spa; - char *firstname; + const char *firstname; nvlist_t *suspended = NULL; pair = nvlist_next_nvpair(snaps, NULL); @@ -1917,8 +1926,8 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; - char *snapname = nvpair_name(pair); - char *atp; + const char *snapname = nvpair_name(pair); + const char *atp; void *cookie; atp = strchr(snapname, '@'); @@ -2061,8 +2070,9 @@ dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, return (error); } +/* Nonblocking dataset sync. 
Assumes dataset:objset is always 1:1 */ void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); @@ -2090,17 +2100,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } - dmu_objset_sync(ds->ds_objset, zio, tx); - - for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { - if (zfeature_active(f, ds->ds_feature_activation[f])) { - if (zfeature_active(f, ds->ds_feature[f])) - continue; - dsl_dataset_activate_feature(ds->ds_object, f, - ds->ds_feature_activation[f], tx); - ds->ds_feature[f] = ds->ds_feature_activation[f]; - } - } + dmu_objset_sync(ds->ds_objset, rio, tx); } /* @@ -2116,8 +2116,6 @@ dsl_livelist_should_disable(dsl_dataset_t *ds) used = dsl_dir_get_usedds(ds->ds_dir); referenced = dsl_get_referenced(ds); - ASSERT3U(referenced, >=, 0); - ASSERT3U(used, >=, 0); if (referenced == 0) return (B_FALSE); percent_shared = (100 * (referenced - used)) / referenced; @@ -2272,9 +2270,18 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) else ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]); - ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, + ds->ds_feature_activation[f])) { + if (zfeature_active(f, ds->ds_feature[f])) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + } - dmu_buf_rele(ds->ds_dbuf, ds); + ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); } int @@ -2331,161 +2338,147 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) nvlist_free(propval); } -/* - * Returns a string that represents the receive resume stats token. It should - * be freed with strfree(). 
- */ -char * -get_receive_resume_stats_impl(dsl_dataset_t *ds) +static char * +get_receive_resume_token_impl(dsl_dataset_t *ds) { + if (!dsl_dataset_has_resume_receive_state(ds)) + return (NULL); + dsl_pool_t *dp = ds->ds_dir->dd_pool; + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + size_t packed_size, compressed_size; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + char buf[MAXNAMELEN]; + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_LARGEBLOCK) == 0) { + fnvlist_add_boolean(token_nv, "largeblockok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_COMPRESSOK) == 0) { + fnvlist_add_boolean(token_nv, "compressok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_RAWOK) == 0) { + fnvlist_add_boolean(token_nv, "rawok"); + } + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t num_redact_snaps = 0; + uint64_t *redact_snaps = NULL; + VERIFY3B(dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, + &redact_snaps), ==, B_TRUE); + fnvlist_add_uint64_array(token_nv, "redact_snaps", + redact_snaps, num_redact_snaps); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { + uint64_t num_redact_snaps = 0, int_size = 0; + uint64_t *redact_snaps = NULL; + VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, + &num_redact_snaps)); + ASSERT3U(int_size, ==, sizeof (uint64_t)); - if (dsl_dataset_has_resume_receive_state(ds)) { - char *str; - void *packed; - uint8_t *compressed; - uint64_t val; - nvlist_t *token_nv = fnvlist_alloc(); - size_t packed_size, compressed_size; - - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "fromguid", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "object", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "offset", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "bytes", val); - } - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - 
DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { - fnvlist_add_uint64(token_nv, "toguid", val); - } - char buf[MAXNAMELEN]; - if (zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { - fnvlist_add_string(token_nv, "toname", buf); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_LARGEBLOCK) == 0) { - fnvlist_add_boolean(token_nv, "largeblockok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_EMBEDOK) == 0) { - fnvlist_add_boolean(token_nv, "embedok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_COMPRESSOK) == 0) { - fnvlist_add_boolean(token_nv, "compressok"); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_RAWOK) == 0) { - fnvlist_add_boolean(token_nv, "rawok"); - } - if (dsl_dataset_feature_is_active(ds, - SPA_FEATURE_REDACTED_DATASETS)) { - uint64_t num_redact_snaps; - uint64_t *redact_snaps; - VERIFY(dsl_dataset_get_uint64_array_feature(ds, - SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, - &redact_snaps)); - fnvlist_add_uint64_array(token_nv, "redact_snaps", - redact_snaps, num_redact_snaps); - } - if (zap_contains(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { - uint64_t num_redact_snaps, int_size; - uint64_t *redact_snaps; - VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, - &num_redact_snaps)); - ASSERT3U(int_size, ==, sizeof (uint64_t)); - - redact_snaps = kmem_alloc(int_size * num_redact_snaps, - KM_SLEEP); - VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, - DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, - num_redact_snaps, redact_snaps)); - fnvlist_add_uint64_array(token_nv, "book_redact_snaps", - redact_snaps, num_redact_snaps); - kmem_free(redact_snaps, int_size * num_redact_snaps); - } - packed = fnvlist_pack(token_nv, &packed_size); - fnvlist_free(token_nv); - compressed = kmem_alloc(packed_size, KM_SLEEP); - - compressed_size = gzip_compress(packed, compressed, - packed_size, packed_size, 6); - - zio_cksum_t cksum; - fletcher_4_native_varsize(compressed, compressed_size, &cksum); - - size_t alloc_size = compressed_size * 2 + 1; - str = kmem_alloc(alloc_size, KM_SLEEP); - for (int i = 0; i < compressed_size; i++) { - size_t offset = i * 2; - (void) snprintf(str + offset, alloc_size - offset, - "%02x", compressed[i]); - } - str[compressed_size * 2] = '\0'; - char *propval = kmem_asprintf("%u-%llx-%llx-%s", - ZFS_SEND_RESUME_TOKEN_VERSION, - (longlong_t)cksum.zc_word[0], - (longlong_t)packed_size, str); - kmem_free(packed, packed_size); - kmem_free(str, alloc_size); - kmem_free(compressed, packed_size); - return (propval); - } - return (kmem_strdup("")); + redact_snaps = kmem_alloc(int_size * num_redact_snaps, + KM_SLEEP); + VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, + num_redact_snaps, redact_snaps)); + fnvlist_add_uint64_array(token_nv, "book_redact_snaps", + redact_snaps, num_redact_snaps); + kmem_free(redact_snaps, int_size * num_redact_snaps); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + zio_cksum_t cksum; + fletcher_4_native_varsize(compressed, compressed_size, &cksum); + + size_t alloc_size = compressed_size * 2 + 1; + str = kmem_alloc(alloc_size, KM_SLEEP); + for 
(int i = 0; i < compressed_size; i++) { + size_t offset = i * 2; + (void) snprintf(str + offset, alloc_size - offset, + "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + char *propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + kmem_free(packed, packed_size); + kmem_free(str, alloc_size); + kmem_free(compressed, packed_size); + return (propval); } /* - * Returns a string that represents the receive resume stats token of the - * dataset's child. It should be freed with strfree(). + * Returns a string that represents the receive resume state token. It should + * be freed with strfree(). NULL is returned if no resume state is present. */ char * -get_child_receive_stats(dsl_dataset_t *ds) +get_receive_resume_token(dsl_dataset_t *ds) { - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + char *token = get_receive_resume_token_impl(ds); + if (token != NULL) + return (token); + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. + */ + /* 6 extra bytes for /%recv */ + char name[ZFS_MAX_DATASET_NAME_LEN + 6]; dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG, - &recv_ds) == 0) { - char *propval = get_receive_resume_stats_impl(recv_ds); + dsl_dataset_name(ds, name); + if (strlcat(name, "/", sizeof (name)) < sizeof (name) && + strlcat(name, recv_clone_name, sizeof (name)) < sizeof (name) && + dsl_dataset_hold(ds->ds_dir->dd_pool, name, FTAG, &recv_ds) == 0) { + token = get_receive_resume_token_impl(recv_ds); dsl_dataset_rele(recv_ds, FTAG); - return (propval); } - return (kmem_strdup("")); -} - -static void -get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) -{ - char *propval = get_receive_resume_stats_impl(ds); - if (strcmp(propval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); - } else { - char *childval = get_child_receive_stats(ds); - if (strcmp(childval, "") != 0) { - dsl_prop_nvlist_add_string(nv, - ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); - } - kmem_strfree(childval); - } - kmem_strfree(propval); + return (token); } uint64_t @@ -2744,6 +2737,8 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, relpath[0] != '\0')) mnt = value + 1; + mnt = kmem_strdup(mnt); + if (relpath[0] == '\0') { (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s", root, mnt); @@ -2753,6 +2748,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, relpath); } kmem_free(buf, ZAP_MAXVALUELEN); + kmem_strfree(mnt); } return (0); @@ -2761,7 +2757,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); @@ -2812,6 +2808,8 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_get_userrefs(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, dsl_get_defer_destroy(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOTS_CHANGED, + dsl_dir_snap_cmtime(ds->ds_dir).tv_sec); dsl_dataset_crypt_stats(ds, nv); if 
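For reference, the token built above is a single string of the form <version>-<cksum>-<size>-<payload>: the decimal ZFS_SEND_RESUME_TOKEN_VERSION, then the first 64-bit word of a fletcher-4 checksum of the compressed payload and the packed nvlist size (both in hex), and finally the gzip-compressed packed nvlist rendered as lowercase hex. A hypothetical consumer could split the header fields like this before inflating the payload (sketch only; the decompression and nvlist unpack steps are omitted):

        unsigned int version;
        unsigned long long cksum_word0, packed_size;
        const char *hexpayload;

        if (sscanf(token, "%u-%llx-%llx-", &version, &cksum_word0,
            &packed_size) != 3)
                return (SET_ERROR(EINVAL));
        hexpayload = strrchr(token, '-') + 1;   /* hex of the gzip'd nvlist */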
(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { @@ -2823,28 +2821,11 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) } if (!dsl_dataset_is_snapshot(ds)) { - /* - * A failed "newfs" (e.g. full) resumable receive leaves - * the stats set on this dataset. Check here for the prop. - */ - get_receive_resume_stats(ds, nv); - - /* - * A failed incremental resumable receive leaves the - * stats set on our child named "%recv". Check the child - * for the prop. - */ - /* 6 extra bytes for /%recv */ - char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; - dsl_dataset_t *recv_ds; - dsl_dataset_name(ds, recvname); - if (strlcat(recvname, "/", sizeof (recvname)) < - sizeof (recvname) && - strlcat(recvname, recv_clone_name, sizeof (recvname)) < - sizeof (recvname) && - dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { - get_receive_resume_stats(recv_ds, nv); - dsl_dataset_rele(recv_ds, FTAG); + char *token = get_receive_resume_token(ds); + if (token != NULL) { + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, token); + kmem_strfree(token); } } } @@ -2915,7 +2896,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) if (snap == NULL) return (B_FALSE); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - birth = dsl_dataset_get_blkptr(ds)->blk_birth; + birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds)); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; @@ -2928,26 +2909,18 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) return (B_TRUE); if (dmu_objset_from_ds(snap, &os_snap) != 0) return (B_TRUE); - return (bcmp(&os->os_phys->os_meta_dnode, + return (memcmp(&os->os_phys->os_meta_dnode, &os_snap->os_phys->os_meta_dnode, sizeof (os->os_phys->os_meta_dnode)) != 0); } return (B_FALSE); } -typedef struct dsl_dataset_rename_snapshot_arg { - const char *ddrsa_fsname; - const char *ddrsa_oldsnapname; - const char *ddrsa_newsnapname; - boolean_t ddrsa_recursive; - dmu_tx_t *ddrsa_tx; -} dsl_dataset_rename_snapshot_arg_t; - -/* ARGSUSED */ static int dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { + (void) dp; dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; int error; uint64_t val; @@ -2973,7 +2946,7 @@ dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, return (error); } -static int +int dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; @@ -3035,7 +3008,7 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, return (0); } -static void +void dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; @@ -3299,8 +3272,8 @@ struct promotenode { static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, - void *tag); -static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); + const void *tag); +static void promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag); int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) @@ -3309,7 +3282,6 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; - dsl_dataset_t *origin_ds, *origin_head; int err; uint64_t unused; uint64_t ss_mv_cnt; @@ -3329,12 +3301,11 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) } snap = list_head(&ddpa->shared_snaps); - origin_head = snap->ds; if (snap == NULL) { err = 
SET_ERROR(ENOENT); goto out; } - origin_ds = snap->ds; + dsl_dataset_t *const origin_ds = snap->ds; /* * Encrypted clones share a DSL Crypto Key with their origin's dsl dir. @@ -3430,10 +3401,10 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) * Check that bookmarks that are being transferred don't have * name conflicts. */ - for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + for (dsl_bookmark_node_t *dbn = avl_first(&origin_ds->ds_bookmarks); dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= dsl_dataset_phys(origin_ds)->ds_creation_txg; - dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) { + dbn = AVL_NEXT(&origin_ds->ds_bookmarks, dbn)) { if (strlen(dbn->dbn_name) >= max_snap_len) { err = SET_ERROR(ENAMETOOLONG); goto out; @@ -3447,7 +3418,8 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) conflicting_snaps = B_TRUE; } else if (err == ESRCH) { err = 0; - } else if (err != 0) { + } + if (err != 0) { goto out; } } @@ -3741,6 +3713,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); + + /* + * Transfer common error blocks from old head to new head. + */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t old_head = origin_head->ds_object; + uint64_t new_head = hds->ds_object; + spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); + } } /* @@ -3751,7 +3732,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) */ static int snaplist_make(dsl_pool_t *dp, - uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) + uint64_t first_obj, uint64_t last_obj, list_t *l, const void *tag) { uint64_t obj = last_obj; @@ -3796,15 +3777,14 @@ snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) } static void -snaplist_destroy(list_t *l, void *tag) +snaplist_destroy(list_t *l, const void *tag) { struct promotenode *snap; if (l == NULL || !list_link_active(&l->list_head)) return; - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); + while ((snap = list_remove_tail(l)) != NULL) { dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } @@ -3812,7 +3792,7 @@ snaplist_destroy(list_t *l, void *tag) } static int -promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) +promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, const void *tag) { int error; dsl_dir_t *dd; @@ -3862,7 +3842,7 @@ out: } static void -promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) +promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag) { snaplist_destroy(&ddpa->shared_snaps, tag); snaplist_destroy(&ddpa->clone_snaps, tag); @@ -4305,7 +4285,6 @@ typedef struct dsl_dataset_set_qr_arg { } dsl_dataset_set_qr_arg_t; -/* ARGSUSED */ static int dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { @@ -4512,7 +4491,6 @@ typedef struct dsl_dataset_set_compression_arg { uint64_t ddsca_value; } dsl_dataset_set_compression_arg_t; -/* ARGSUSED */ static int dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx) { @@ -4540,6 +4518,7 @@ dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx) uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); spa_feature_t f = zio_compress_to_feature(compval); + ASSERT3S(f, !=, SPA_FEATURE_NONE); ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds)); @@ -4951,7 +4930,7 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, if (num_redact_snaps > 0) { ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t), 
KM_SLEEP); - bcopy(redact_snaps, ftuaa->array, num_redact_snaps * + memcpy(ftuaa->array, redact_snaps, num_redact_snaps * sizeof (uint64_t)); } dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS, @@ -4959,19 +4938,45 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; } -/* BEGIN CSTYLED */ -#if defined(_LP64) -#define RECORDSIZE_PERM ZMOD_RW -#else -/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ -#define RECORDSIZE_PERM ZMOD_RD -#endif -ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, +/* + * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj + * dataset whose birth time is >= min_txg. + */ +int +dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg, + uint64_t *oldest_dsobj) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0 && min_txg < prev_obj_txg) { + dsl_dataset_rele(ds, FTAG); + if ((error = dsl_dataset_hold_obj(dp, prev_obj, + FTAG, &ds)) != 0) + return (error); + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + *oldest_dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + return (0); +} + +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, UINT, ZMOD_RW, "Max allowed record size"); ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, "Allow mounting of redacted datasets"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs, zfs_, snapshot_history_enabled, INT, ZMOD_RW, + "Include snapshot events in pool history/events"); EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_flags); diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c index a77e381520db..eff1f7de7731 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -92,7 +92,7 @@ * will be loaded into memory and shouldn't take up an inordinate amount of * space. We settled on ~500000 entries, corresponding to roughly 128M. */ -unsigned long zfs_livelist_max_entries = 500000; +uint64_t zfs_livelist_max_entries = 500000; /* * We can approximate how much of a performance gain a livelist will give us @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. 
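The new dsl_dataset_oldest_snapshot() above walks the ds_prev_snap_obj chain from the head dataset toward older snapshots and returns, in *oldest_dsobj, the oldest snapshot whose birth txg is still >= min_txg. A hypothetical caller (variable names illustrative) might look like:

        uint64_t oldest_obj;
        int err;

        err = dsl_dataset_oldest_snapshot(spa, head_ds, min_txg, &oldest_obj);
        if (err == 0) {
                zfs_dbgmsg("oldest snapshot of %llu born at or after txg %llu "
                    "is obj %llu", (u_longlong_t)head_ds,
                    (u_longlong_t)min_txg, (u_longlong_t)oldest_obj);
        }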
*/ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); @@ -438,6 +438,18 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, } } +/* + * Prefetch metadata required for dle_enqueue_subobj(). + */ +static void +dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj) +{ + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) + bpobj_prefetch_subobj(&dle->dle_bpobj, obj); +} + void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -462,7 +474,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); - dle_tofind.dle_mintxg = bp->blk_birth; + dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp); dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); @@ -471,7 +483,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, if (dle == NULL) { zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", - bp, (longlong_t)bp->blk_birth); + bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp)); dle = avl_first(&dl->dl_tree); } @@ -542,6 +554,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); ASSERT3P(dle, !=, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); + ASSERT3P(dle_prev, !=, NULL); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); @@ -809,6 +822,27 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dle_enqueue_subobj(dl, dle, obj, tx); } +/* + * Prefetch metadata required for dsl_deadlist_insert_bpobj(). + */ +static void +dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + dle_prefetch_subobj(dl, dle, obj); +} + static int dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) @@ -825,12 +859,12 @@ dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { - zap_cursor_t zc; - zap_attribute_t za; + zap_cursor_t zc, pzc; + zap_attribute_t *za, *pza; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; - int error; + int error, perror, i; VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { @@ -841,23 +875,46 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) return; } + za = kmem_alloc(sizeof (*za), KM_SLEEP); + pza = kmem_alloc(sizeof (*pza), KM_SLEEP); + mutex_enter(&dl->dl_lock); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. 
+ */ + for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; + (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128; + zap_cursor_advance(&pzc), i++) { + dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, + zfs_strtonum(pza->za_name, NULL)); + } for (zap_cursor_init(&zc, dl->dl_os, obj); - (error = zap_cursor_retrieve(&zc, &za)) == 0; + (error = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { - uint64_t mintxg = zfs_strtonum(za.za_name, NULL); - dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); - VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); + dsl_deadlist_insert_bpobj(dl, za->za_first_integer, + zfs_strtonum(za->za_name, NULL), tx); + VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx)); + if (perror == 0) { + dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, + zfs_strtonum(pza->za_name, NULL)); + zap_cursor_advance(&pzc); + perror = zap_cursor_retrieve(&pzc, pza); + } } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); + zap_cursor_fini(&pzc); VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; dmu_buf_will_dirty(bonus, tx); - bzero(dlp, sizeof (*dlp)); + memset(dlp, 0, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); mutex_exit(&dl->dl_lock); + + kmem_free(za, sizeof (*za)); + kmem_free(pza, sizeof (*pza)); } /* @@ -868,8 +925,9 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t *dle, *pdle; avl_index_t where; + int i; ASSERT(!dl->dl_oldfmt); @@ -881,11 +939,23 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. + */ + for (pdle = dle, i = 0; pdle && i < 128; i++) { + bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); + pdle = AVL_NEXT(&dl->dl_tree, pdle); + } while (dle) { uint64_t used, comp, uncomp; dsl_deadlist_entry_t *dle_next; bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); + if (pdle) { + bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); + pdle = AVL_NEXT(&dl->dl_tree, pdle); + } VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); @@ -930,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg) /* if vdevs are equal, sort by offsets. */ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); - if (l_dva0_offset == r_dva0_offset) - ASSERT3U(l->blk_birth, ==, r->blk_birth); return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } @@ -946,9 +1014,9 @@ struct livelist_iter_arg { * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. * - * Note that multiple FREE and ALLOC entries for the same blkptr may - * be encountered when dedup is involved. For this reason we keep a - * refcount for all the FREE entries of each blkptr and ensure that + * Note that multiple FREE and ALLOC entries for the same blkptr may be + * encountered when dedup or block cloning is involved. For this reason we + * keep a refcount for all the FREE entries of each blkptr and ensure that * each of those FREE entries has a corresponding ALLOC preceding it. 
*/ static int @@ -967,6 +1035,12 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, livelist_entry_t node; node.le_bp = *bp; livelist_entry_t *found = avl_find(avl, &node, NULL); + if (found) { + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp)); + } if (bp_freed) { if (found == NULL) { /* first free entry for this blkptr */ @@ -976,10 +1050,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, e->le_refcnt = 1; avl_add(avl, e); } else { - /* dedup block free */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); + /* + * Deduped or cloned block free. We could assert D bit + * for dedup, but there is no such one for cloning. + */ ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); found->le_refcnt++; } @@ -995,14 +1069,6 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, /* all tracked free pairs have been matched */ avl_remove(avl, found); kmem_free(found, sizeof (livelist_entry_t)); - } else { - /* - * This is definitely a deduped blkptr so - * let's validate it. - */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); } } } @@ -1028,16 +1094,19 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, .t = t }; int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); + VERIFY(err != 0 || avl_numnodes(&avl) == 0); - VERIFY0(avl_numnodes(&avl)); + void *cookie = NULL; + livelist_entry_t *le = NULL; + while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) { + kmem_free(le, sizeof (livelist_entry_t)); + } avl_destroy(&avl); return (err); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW, "Size to start the next sub-livelist in a livelist"); ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, "Threshold at which livelist is disabled"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dsl_deleg.c b/sys/contrib/openzfs/module/zfs/dsl_deleg.c index cf8a3c9bbdfb..645ad8e5b8dc 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deleg.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deleg.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c index a2748197f29d..d4a6e5b6e9fd 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c +++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -49,6 +49,8 @@ #include <sys/zthr.h> #include <sys/spa_impl.h> +extern int zfs_snapshot_history_enabled; + int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) { @@ -130,10 +132,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); - if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) <= + dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > + BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); @@ -200,7 +203,7 @@ rck_alloc(dsl_dataset_t *clone) static void dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx, - list_t *stack, void *tag) + list_t *stack, const void *tag) { objset_t *mos = dd->dd_pool->dp_meta_objset; @@ -311,7 +314,8 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); @@ -321,14 +325,19 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; - spa_history_log_internal_ds(ds, "defer_destroy", tx, " "); + if (zfs_snapshot_history_enabled) { + spa_history_log_internal_ds(ds, "defer_destroy", tx, + " "); + } return; } ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); - /* We need to log before removing it from the namespace. */ - spa_history_log_internal_ds(ds, "destroy", tx, " "); + if (zfs_snapshot_history_enabled) { + /* We need to log before removing it from the namespace. 
*/ + spa_history_log_internal_ds(ds, "destroy", tx, " "); + } dsl_scan_ds_destroyed(ds, tx); @@ -651,7 +660,7 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, zfs_lua_max_memlimit, fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result); if (error != 0) { - char *errorstr = NULL; + const char *errorstr = NULL; (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); if (errorstr != NULL) { zfs_dbgmsg("%s", errorstr); @@ -699,11 +708,11 @@ struct killarg { dmu_tx_t *tx; }; -/* ARGSUSED */ static int kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { + (void) spa, (void) dnp; struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; @@ -720,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } @@ -1010,7 +1019,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(ds->ds_prev == NULL || dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); @@ -1118,6 +1128,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, + &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, + tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), @@ -1153,6 +1173,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); dsl_dataset_rele(prev, FTAG); } + /* Delete errlog. */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) + spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx); } void @@ -1246,10 +1269,10 @@ dsl_destroy_head(const char *name) * inconsistent datasets, even if we encounter an error trying to * process one of them. */ -/* ARGSUSED */ int dsl_destroy_inconsistent(const char *dsname, void *arg) { + (void) arg; objset_t *os; if (dmu_objset_hold(dsname, FTAG, &os) == 0) { diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c index 84caace4dbab..baf970121a61 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -26,6 +26,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 
*/ #include <sys/dmu.h> @@ -54,6 +55,15 @@ #include "zfs_prop.h" /* + * This controls if we verify the ZVOL quota or not. + * Currently, quotas are not implemented for ZVOLs. + * The quota size is the size of the ZVOL. + * The size of the volume already implies the ZVOL size quota. + * The quota mechanism can introduce a significant performance drop. + */ +static int zvol_enforce_quotas = B_TRUE; + +/* * Filesystem and Snapshot Limits * ------------------------------ * @@ -121,8 +131,6 @@ * dsl_dir_init_fs_ss_count(). */ -extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); - static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); typedef struct ddulrt_arg { @@ -162,7 +170,7 @@ dsl_dir_evict_async(void *dbu) int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag, dsl_dir_t **ddp) + const char *tail, const void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; @@ -209,8 +217,6 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, } } - dsl_dir_snap_cmtime_update(dd); - if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, @@ -272,6 +278,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, } } + if (dsl_dir_is_zapified(dd)) { + inode_timespec_t t = {0}; + (void) zap_lookup(dp->dp_meta_objset, ddobj, + DD_FIELD_SNAPSHOTS_CHANGED, + sizeof (uint64_t), + sizeof (inode_timespec_t) / sizeof (uint64_t), + &t); + dd->dd_snap_cmtime = t; + } + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); @@ -322,7 +338,7 @@ errout: } void -dsl_dir_rele(dsl_dir_t *dd, void *tag) +dsl_dir_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); @@ -337,7 +353,7 @@ dsl_dir_rele(dsl_dir_t *dd, void *tag) * the spa. */ void -dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +dsl_dir_async_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_async_close(dd->dd_pool->dp_spa, tag); @@ -422,8 +438,7 @@ getcomponent(const char *path, char *component, const char **nextp) } else if (p[0] == '/') { if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; + (void) strlcpy(component, path, p - path + 1); p++; } else if (p[0] == '@') { /* @@ -434,8 +449,7 @@ getcomponent(const char *path, char *component, const char **nextp) return (SET_ERROR(EINVAL)); if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); - (void) strncpy(component, path, p - path); - component[p - path] = '\0'; + (void) strlcpy(component, path, p - path + 1); } else { panic("invalid p=%p", (void *)p); } @@ -451,7 +465,7 @@ getcomponent(const char *path, char *component, const char **nextp) * (*tail)[0] == '@' means that the last component is a snapshot. 
*/ int -dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, +dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; @@ -764,6 +778,8 @@ dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, */ if (secpolicy_zfs_proc(cr, proc) == 0) return (ENFORCE_NEVER); +#else + (void) proc; #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) @@ -801,7 +817,7 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; - char *count_prop; + const char *count_prop; enforce_res_t enforce; int err = 0; @@ -809,6 +825,18 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. + */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative @@ -828,19 +856,6 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, if (delta == 0) return (0); - if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { - /* - * We don't enforce the limit for temporary snapshots. This is - * indicated by a NULL cred_t argument. - */ - if (cr == NULL) - return (0); - - count_prop = DD_FIELD_SNAPSHOT_COUNT; - } else { - count_prop = DD_FIELD_FILESYSTEM_COUNT; - } - /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount @@ -1172,10 +1187,9 @@ dsl_dir_space_towrite(dsl_dir_t *dd) ASSERT(MUTEX_HELD(&dd->dd_lock)); - for (int i = 0; i < TXG_SIZE; i++) { + for (int i = 0; i < TXG_SIZE; i++) space += dd->dd_space_towrite[i & TXG_MASK]; - ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); - } + return (space); } @@ -1262,6 +1276,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, uint64_t quota; struct tempreserve *tr; int retval; + uint64_t ext_quota; uint64_t ref_rsrv; top_of_function: @@ -1305,7 +1320,9 @@ top_of_function: * If this transaction will result in a net free of space, * we want to let it through. */ - if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) + if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 || + (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL && + zvol_enforce_quotas == B_FALSE)) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; @@ -1320,7 +1337,6 @@ top_of_function: * we're very close to full, this will allow a steady trickle of * removes to get through. */ - uint64_t deferred = 0; if (dd->dd_parent == NULL) { uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, (netfree) ? @@ -1335,21 +1351,31 @@ top_of_function: /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). + * on-disk is over quota and there are no pending changes + * or deferred frees (which may free up space for us). 
*/ - if (used_on_disk + est_inflight >= quota) { - if (est_inflight > 0 || used_on_disk < quota || - (retval == ENOSPC && used_on_disk < quota + deferred)) - retval = ERESTART; + ext_quota = quota >> 5; + if (quota == UINT64_MAX) + ext_quota = 0; + + if (used_on_disk >= quota) { + if (retval == ENOSPC && (used_on_disk - quota) < + dsl_pool_deferred_space(dd->dd_pool)) { + retval = SET_ERROR(ERESTART); + } + /* Quota exceeded */ + mutex_exit(&dd->dd_lock); + DMU_TX_STAT_BUMP(dmu_tx_quota); + return (retval); + } else if (used_on_disk + est_inflight >= quota + ext_quota) { dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " - "quota=%lluK tr=%lluK err=%d\n", + "quota=%lluK tr=%lluK\n", (u_longlong_t)used_on_disk>>10, (u_longlong_t)est_inflight>>10, - (u_longlong_t)quota>>10, (u_longlong_t)asize>>10, retval); + (u_longlong_t)quota>>10, (u_longlong_t)asize>>10); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); - return (SET_ERROR(retval)); + return (SET_ERROR(ERESTART)); } /* We need to up our estimated delta before dropping dd_lock */ @@ -1377,10 +1403,9 @@ top_of_function: ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); first = B_FALSE; goto top_of_function; - - } else { - return (0); } + + return (0); } /* @@ -1459,7 +1484,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) if (tr_cookie == NULL) return; - while ((tr = list_head(tr_list)) != NULL) { + while ((tr = list_remove_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, @@ -1469,7 +1494,6 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) } else { arc_tempreserve_clear(tr->tr_size); } - list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } @@ -1896,10 +1920,10 @@ typedef struct dsl_valid_rename_arg { int nest_delta; } dsl_valid_rename_arg_t; -/* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { + (void) dp; dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; @@ -2094,6 +2118,8 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); + ASSERT3P(mynewname, !=, NULL); + /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); @@ -2236,13 +2262,25 @@ dsl_dir_snap_cmtime(dsl_dir_t *dd) } void -dsl_dir_snap_cmtime_update(dsl_dir_t *dd) +dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx) { + dsl_pool_t *dp = dmu_tx_pool(tx); inode_timespec_t t; - gethrestime(&t); + mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; + if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_EXTENSIBLE_DATASET)) { + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t ddobj = dd->dd_object; + dsl_dir_zapify(dd, tx); + VERIFY0(zap_update(mos, ddobj, + DD_FIELD_SNAPSHOTS_CHANGED, + sizeof (uint64_t), + sizeof (inode_timespec_t) / sizeof (uint64_t), + &t, tx)); + } mutex_exit(&dd->dd_lock); } @@ -2396,6 +2434,7 @@ dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, * The delete queue is ZPL specific, and libzpool doesn't have * it. It doesn't make sense to wait for it. 
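To put the new soft headroom in dsl_dir_tempreserve_impl() in concrete terms: ext_quota is quota >> 5, roughly 3 percent. With a hypothetical 128 GiB dd_quota that is 4 GiB of slack, so the estimate-based check only returns ERESTART once used_on_disk + est_inflight reaches 132 GiB, while actually reaching 128 GiB on disk returns the quota error immediately, unless the overshoot is smaller than the pool's deferred free space, in which case ERESTART lets the caller retry after the pending frees complete.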
*/ + (void) ds; *in_progress = B_FALSE; break; #endif @@ -2448,3 +2487,7 @@ dsl_dir_cancel_waiters(dsl_dir_t *dd) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW, + "Enable strict ZVOL quota enforcment"); diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 1350f1329564..342ec5c15c79 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -99,32 +99,31 @@ * capped at zfs_dirty_data_max_max. It can also be overridden with a module * parameter. */ -unsigned long zfs_dirty_data_max = 0; -unsigned long zfs_dirty_data_max_max = 0; -int zfs_dirty_data_max_percent = 10; -int zfs_dirty_data_max_max_percent = 25; +uint64_t zfs_dirty_data_max = 0; +uint64_t zfs_dirty_data_max_max = 0; +uint_t zfs_dirty_data_max_percent = 10; +uint_t zfs_dirty_data_max_max_percent = 25; /* - * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. - * Once it is reached, write operation is blocked, - * until log data is cleared out after txg sync. + * The upper limit of TX_WRITE log data. Write operations are throttled + * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ -unsigned long zfs_wrlog_data_max = 0; +uint64_t zfs_wrlog_data_max = 0; /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than * zfs_vdev_async_write_active_min_dirty_percent. */ -int zfs_dirty_data_sync_percent = 20; +static uint_t zfs_dirty_data_sync_percent = 20; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. */ -int zfs_delay_min_dirty_percent = 60; +uint_t zfs_delay_min_dirty_percent = 60; /* * This controls how quickly the delay approaches infinity. @@ -139,12 +138,7 @@ int zfs_delay_min_dirty_percent = 60; * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). */ -unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; - -/* - * This determines the number of threads used by the dp_sync_taskq. - */ -int zfs_sync_taskq_batch_pct = 75; +uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* * These tunables determine the behavior of how zil_itxg_clean() is @@ -172,9 +166,9 @@ int zfs_sync_taskq_batch_pct = 75; * Additionally, the number of threads used by the taskq can be * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. 
*/ -int zfs_zil_clean_taskq_nthr_pct = 100; -int zfs_zil_clean_taskq_minalloc = 1024; -int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; +static int zfs_zil_clean_taskq_nthr_pct = 100; +static int zfs_zil_clean_taskq_minalloc = 1024; +static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) @@ -215,9 +209,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); - dp->dp_sync_taskq = taskq_create("dp_sync_taskq", - zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, - TASKQ_THREADS_CPU_PCT); + dp->dp_sync_taskq = spa_sync_tq_create(spa, "dp_sync_taskq"); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, @@ -332,7 +324,6 @@ dsl_pool_open(dsl_pool_t *dp) /* * We might not have created the remap bpobj yet. */ - err = 0; } else { goto out; } @@ -411,7 +402,7 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); - taskq_destroy(dp->dp_sync_taskq); + spa_sync_tq_destroy(dp->dp_spa); /* * We can't set retry to TRUE since we're explicitly specifying @@ -439,10 +430,8 @@ dsl_pool_close(dsl_pool_t *dp) taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); - if (dp->dp_blkstats != NULL) { - mutex_destroy(&dp->dp_blkstats->zab_lock); + if (dp->dp_blkstats != NULL) vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - } kmem_free(dp, sizeof (dsl_pool_t)); } @@ -476,8 +465,8 @@ dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) } dsl_pool_t * -dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp, - uint64_t txg) +dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)), + dsl_crypto_params_t *dcp, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); @@ -623,15 +612,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = - zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t -dsl_pool_wrlog_over_max(dsl_pool_t *dp) +dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { - return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); + uint64_t delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + + return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void @@ -641,6 +633,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); + /* Compact per-CPU sums after the big change. 
*/ + (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG @@ -664,12 +659,15 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) return (B_TRUE); } +#else +#define dsl_early_sync_task_verify(dp, txg) \ + ((void) sizeof (dp), (void) sizeof (txg), B_TRUE) #endif void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { - zio_t *zio; + zio_t *rio; /* root zio for all dirty dataset syncs */ dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; @@ -699,9 +697,10 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } /* - * Write out all dirty blocks of dirty datasets. + * Write out all dirty blocks of dirty datasets. Note, this could + * create a very large (+10k) zio tree. */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because @@ -710,9 +709,9 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Update the long range free counter after @@ -743,13 +742,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + rio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); - dsl_dataset_sync(ds, zio, tx); + dsl_dataset_sync(ds, rio, tx); /* * Release any key mappings created by calls to @@ -762,7 +761,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } - VERIFY0(zio_wait(zio)); + VERIFY0(zio_wait(rio)); /* * Now that the datasets have been completely synced, we can @@ -783,6 +782,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) } dsl_dataset_sync_done(ds, tx); + dmu_buf_rele(ds->ds_dbuf, ds); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { @@ -947,24 +947,30 @@ dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) return (quota); } +uint64_t +dsl_pool_deferred_space(dsl_pool_t *dp) +{ + return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa))); +} + boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - mutex_enter(&dp->dp_lock); - uint64_t dirty = dp->dp_dirty_total; - mutex_exit(&dp->dp_lock); - - return (dirty > delay_min_bytes); + /* + * We are not taking the dp_lock here and few other places, since torn + * reads are unlikely: on 64-bit systems due to register size and on + * 32-bit due to memory constraints. Pool-wide locks in hot path may + * be too expensive, while we do not need a precise result here. 
+ */ + return (dp->dp_dirty_total > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { - ASSERT(MUTEX_HELD(&dp->dp_lock)); - uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; @@ -1007,7 +1013,6 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) mutex_exit(&dp->dp_lock); } -/* ARGSUSED */ static int upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { @@ -1042,7 +1047,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) * will be wrong. */ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); + ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp)); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ @@ -1098,7 +1103,6 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } -/* ARGSUSED */ static int upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { @@ -1377,7 +1381,7 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, */ int -dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) +dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp) { spa_t *spa; int error; @@ -1391,14 +1395,14 @@ dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) } void -dsl_pool_rele(dsl_pool_t *dp, void *tag) +dsl_pool_rele(dsl_pool_t *dp, const void *tag) { dsl_pool_config_exit(dp, tag); spa_close(dp->dp_spa, tag); } void -dsl_pool_config_enter(dsl_pool_t *dp, void *tag) +dsl_pool_config_enter(dsl_pool_t *dp, const void *tag) { /* * We use a "reentrant" reader-writer lock, but not reentrantly. @@ -1417,14 +1421,14 @@ dsl_pool_config_enter(dsl_pool_t *dp, void *tag) } void -dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) +dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag) { ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter_read_prio(&dp->dp_config_rwlock, tag); } void -dsl_pool_config_exit(dsl_pool_t *dp, void *tag) +dsl_pool_config_exit(dsl_pool_t *dp, const void *tag) { rrw_exit(&dp->dp_config_rwlock, tag); } @@ -1444,37 +1448,33 @@ dsl_pool_config_held_writer(dsl_pool_t *dp) EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); -/* BEGIN CSTYLED */ /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD, "Max percent of RAM allowed to be dirty"); /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD, "zfs_dirty_data_max upper bound as % of RAM"); -ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW, "Transaction delay threshold"); -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW, "Determines the dirty space limit"); -ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW, "The size limit of write-transaction zil log data"); /* zfs_dirty_data_max_max only applied at module load in arc_init(). 
*/ -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); -ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, - "Max percent of CPUs that are used to sync dirty data"); - ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); @@ -1483,4 +1483,3 @@ ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW, "Max number of taskq entries that are cached"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c index dfa04d7681be..99f931cd8632 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_prop.c +++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. 
*/ #include <sys/zfs_context.h> @@ -41,6 +42,7 @@ #define ZPROP_INHERIT_SUFFIX "$inherit" #define ZPROP_RECVD_SUFFIX "$recvd" +#define ZPROP_IUV_SUFFIX "$iuv" static int dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) @@ -57,7 +59,7 @@ dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { if (intsz != 1) return (SET_ERROR(EOVERFLOW)); - (void) strncpy(buf, zfs_prop_default_string(prop), + (void) strlcpy(buf, zfs_prop_default_string(prop), numints); } else { if (intsz != 8 || numints < 1) @@ -69,6 +71,17 @@ dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) return (0); } +static int +dsl_prop_known_index(zfs_prop_t prop, uint64_t value) +{ + const char *str = NULL; + if (prop != ZPROP_CONT && prop != ZPROP_INVAL && + zfs_prop_get_type(prop) == PROP_TYPE_INDEX) + return (!zfs_prop_index_to_string(prop, value, &str)); + + return (-1); +} + int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) @@ -81,6 +94,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, boolean_t inheriting = B_FALSE; char *inheritstr; char *recvdstr; + char *iuvstr; ASSERT(dsl_pool_config_held(dd->dd_pool)); @@ -88,9 +102,10 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, setpoint[0] = '\0'; prop = zfs_name_to_prop(propname); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop)); inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); /* * Note: dd may become NULL, therefore we shouldn't dereference it @@ -105,6 +120,18 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, inheriting = B_TRUE; } + /* Check for a iuv value. */ + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + iuvstr, intsz, numints, buf); + if (err == 0 && dsl_prop_known_index(prop, + *(uint64_t *)buf) != 1) + err = ENOENT; + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + dsl_dir_name(dd, setpoint); + break; + } + /* Check for a local value. */ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, propname, intsz, numints, buf); @@ -155,6 +182,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, kmem_strfree(inheritstr); kmem_strfree(recvdstr); + kmem_strfree(iuvstr); return (err); } @@ -168,7 +196,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, uint64_t zapobj; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); - inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop)); zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { @@ -504,10 +532,10 @@ dsl_prop_hascb(dsl_dataset_t *ds) return (!list_is_empty(&ds->ds_prop_cbs)); } -/* ARGSUSED */ static int dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { + (void) arg; dsl_dir_t *dd = ds->ds_dir; dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; @@ -647,6 +675,45 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_rele(dd, FTAG); } + +/* + * For newer values in zfs index type properties, we add a new key + * propname$iuv (iuv = Ignore Unknown Values) to the properties zap object + * to store the new property value and store the default value in the + * existing prop key. 
This way, the propname$iuv key is ignored by older zfs + * versions and the default property value from the existing prop key is + * used. + */ +static void +dsl_prop_set_iuv(objset_t *mos, uint64_t zapobj, const char *propname, + int intsz, int numints, const void *value, dmu_tx_t *tx) +{ + char *iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); + boolean_t iuv = B_FALSE; + zfs_prop_t prop = zfs_name_to_prop(propname); + + switch (prop) { + case ZFS_PROP_REDUNDANT_METADATA: + if (*(uint64_t *)value == ZFS_REDUNDANT_METADATA_SOME || + *(uint64_t *)value == ZFS_REDUNDANT_METADATA_NONE) + iuv = B_TRUE; + break; + default: + break; + } + + if (iuv) { + VERIFY0(zap_update(mos, zapobj, iuvstr, intsz, numints, + value, tx)); + uint64_t val = zfs_prop_default_numeric(prop); + VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, + &val, tx)); + } else { + zap_remove(mos, zapobj, iuvstr, tx); + } + kmem_strfree(iuvstr); +} + void dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, zprop_source_t source, int intsz, int numints, const void *value, @@ -659,6 +726,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, const char *valstr = NULL; char *inheritstr; char *recvdstr; + char *iuvstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); @@ -692,6 +760,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); switch ((int)source) { case ZPROP_SRC_NONE: @@ -709,11 +778,14 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, /* * remove propname$inherit * set propname -> value + * set propname$iuv -> new property value */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, value, tx)); + (void) dsl_prop_set_iuv(mos, zapobj, propname, intsz, + numints, value, tx); break; case ZPROP_SRC_INHERITED: /* @@ -723,6 +795,8 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, iuvstr, tx); + ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; @@ -749,7 +823,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, ASSERT(err == 0 || err == ENOENT); err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - fallthrough; + zfs_fallthrough; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * remove propname$recvd @@ -763,6 +837,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, kmem_strfree(inheritstr); kmem_strfree(recvdstr); + kmem_strfree(iuvstr); /* * If we are left with an empty snap zap we can destroy it. @@ -881,7 +956,7 @@ dsl_props_set_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(ENAMETOOLONG)); } if (nvpair_type(elem) == DATA_TYPE_STRING) { - char *valstr = fnvpair_value_string(elem); + const char *valstr = fnvpair_value_string(elem); if (strlen(valstr) >= (version < SPA_VERSION_STMF_PROP ? ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { @@ -1012,6 +1087,14 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, propname = za.za_name; source = setpoint; + + /* Skip if iuv entries are present.
*/ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_IUV_SUFFIX); + err = zap_contains(mos, propobj, valstr); + kmem_strfree(valstr); + if (err == 0) + continue; } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { /* Skip explicitly inherited entries. */ continue; @@ -1019,8 +1102,8 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, if (flags & DSL_PROP_GET_LOCAL) continue; - (void) strncpy(buf, za.za_name, (suffix - za.za_name)); - buf[suffix - za.za_name] = '\0'; + (void) strlcpy(buf, za.za_name, + MIN(sizeof (buf), suffix - za.za_name + 1)); propname = buf; if (!(flags & DSL_PROP_GET_RECEIVED)) { @@ -1044,6 +1127,16 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, source = ((flags & DSL_PROP_GET_INHERITING) ? setpoint : ZPROP_SOURCE_VAL_RECVD); + } else if (strcmp(suffix, ZPROP_IUV_SUFFIX) == 0) { + (void) strlcpy(buf, za.za_name, + MIN(sizeof (buf), suffix - za.za_name + 1)); + propname = buf; + source = setpoint; + prop = zfs_name_to_prop(propname); + + if (dsl_prop_known_index(prop, + za.za_first_integer) != 1) + continue; } else { /* * For backward compatibility, skip suffixes we don't @@ -1055,12 +1148,12 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, prop = zfs_name_to_prop(propname); /* Skip non-inheritable properties. */ - if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && - !zfs_prop_inheritable(prop)) + if ((flags & DSL_PROP_GET_INHERITING) && + prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) continue; /* Skip properties not valid for this type. */ - if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && + if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_USERPROP && !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE)) continue; diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index d25c067dfbc1..085cfd3c5691 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -37,6 +37,7 @@ #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> #include <sys/arc.h> +#include <sys/arc_impl.h> #include <sys/zap.h> #include <sys/zio.h> #include <sys/zfs_context.h> @@ -46,12 +47,14 @@ #include <sys/vdev_impl.h> #include <sys/zil_impl.h> #include <sys/zio_checksum.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/sa.h> #include <sys/sa_impl.h> #include <sys/zfeature.h> #include <sys/abd.h> #include <sys/range_tree.h> +#include <sys/dbuf.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -126,9 +129,20 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_data_disks(vdev_t *vd); +static uint64_t dsl_scan_count_data_disks(spa_t *spa); +static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb); -extern int zfs_vdev_async_write_active_min_dirty_percent; +extern uint_t zfs_vdev_async_write_active_min_dirty_percent; +static int zfs_scan_blkstats = 0; + +/* + * 'zpool status' uses bytes processed per pass to report throughput and + * estimate time remaining. We define a pass to start when the scanning + * phase completes for a sequential resilver. Optionally, this value + * may be used to reset the pass statistics every N txgs to provide an + * estimated completion time based on currently observed performance. + */ +static uint_t zfs_scan_report_txgs = 0; /* * By default zfs will check to ensure it is not over the hard memory @@ -136,7 +150,7 @@ extern int zfs_vdev_async_write_active_min_dirty_percent; * this value can be set to 1 to enable checking before scanning each * block. */ -int zfs_scan_strict_mem_lim = B_FALSE; +static int zfs_scan_strict_mem_lim = B_FALSE; /* * Maximum number of parallelly executed bytes per leaf vdev. We attempt @@ -146,41 +160,57 @@ int zfs_scan_strict_mem_lim = B_FALSE; * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. */ -unsigned long zfs_scan_vdev_limit = 4 << 20; +static uint64_t zfs_scan_vdev_limit = 16 << 20; + +static uint_t zfs_scan_issue_strategy = 0; -int zfs_scan_issue_strategy = 0; -int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ -unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ +/* don't queue & sort zios, go direct */ +static int zfs_scan_legacy = B_FALSE; +static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ /* * fill_weight is non-tunable at runtime, so we copy it at module init from * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would * break queue sorting. 
*/ -int zfs_scan_fill_weight = 3; +static uint_t zfs_scan_fill_weight = 3; static uint64_t fill_weight; /* See dsl_scan_should_clear() for details on the memory limit tunables */ -uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ -uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ -int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ -int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ - -int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ -int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ -int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ -int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ -int zfs_scan_checkpoint_intval = 7200; /* in seconds */ +static const uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ + + +/* fraction of physmem */ +static uint_t zfs_scan_mem_lim_fact = 20; + +/* fraction of mem lim above */ +static uint_t zfs_scan_mem_lim_soft_fact = 20; + +/* minimum milliseconds to scrub per txg */ +static uint_t zfs_scrub_min_time_ms = 1000; + +/* minimum milliseconds to obsolete per txg */ +static uint_t zfs_obsolete_min_time_ms = 500; + +/* minimum milliseconds to free per txg */ +static uint_t zfs_free_min_time_ms = 1000; + +/* minimum milliseconds to resilver per txg */ +static uint_t zfs_resilver_min_time_ms = 3000; + +static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */ int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */ -int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ -enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ +static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ -unsigned long zfs_async_block_max_blocks = ULONG_MAX; +static uint64_t zfs_async_block_max_blocks = UINT64_MAX; /* max number of dedup blocks to free in a single TXG */ -unsigned long zfs_max_async_dedup_frees = 100000; +static uint64_t zfs_max_async_dedup_frees = 100000; -int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */ +/* set to disable resilver deferring */ +static int zfs_resilver_disable_defer = B_FALSE; /* * We wait a few txgs after importing a pool to begin scanning so that @@ -201,7 +231,10 @@ int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */ /* * Enable/disable the processing of the free_bpobj object. */ -int zfs_free_bpobj_enabled = 1; +static int zfs_free_bpobj_enabled = 1; + +/* Error blocks to be scrubbed in one txg. */ +static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { @@ -219,9 +252,9 @@ typedef struct { /* * This controls what conditions are placed on dsl_scan_sync_state(): - * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 - * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. - * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise + * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0 + * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0. 
+ * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise * write out the scn_phys_cached version. * See dsl_scan_sync_state for details. */ @@ -264,7 +297,7 @@ typedef struct scan_io { * event of an error. This array must go at the end of the * struct to allow this for the variable number of elements. */ - dva_t sio_dva[0]; + dva_t sio_dva[]; } scan_io_t; #define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) @@ -279,12 +312,14 @@ typedef struct scan_io { struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ + zio_t *q_zio; /* scn_zio_root child for waiting on IO */ /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; - zfs_btree_t q_exts_by_size; + zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; + uint64_t q_last_ext_addr; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; @@ -392,25 +427,25 @@ dsl_scan_resilvering(dsl_pool_t *dp) static inline void sio2bp(const scan_io_t *sio, blkptr_t *bp) { - bzero(bp, sizeof (*bp)); + memset(bp, 0, sizeof (*bp)); bp->blk_prop = sio->sio_blk_prop; - bp->blk_phys_birth = sio->sio_phys_birth; - bp->blk_birth = sio->sio_birth; + BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth); + BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth); bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); - bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); + memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; - sio->sio_phys_birth = bp->blk_phys_birth; - sio->sio_birth = bp->blk_birth; + sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); + sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp); sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); @@ -447,14 +482,16 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) /* * Calculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. + * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); + mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); @@ -481,8 +518,16 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), + ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys); + + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); + /* * Detect if the pool contains the signature of #2094. 
If it * does properly update the scn->scn_phys structure and notify @@ -507,7 +552,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) return (EOVERFLOW); } - bcopy(zaptmp, &scn->scn_phys, + memcpy(&scn->scn_phys, zaptmp, SCAN_PHYS_NUMINTS * sizeof (uint64_t)); scn->scn_phys.scn_flags = overflow; @@ -529,7 +574,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * counter to how far we've scanned. We know we're consistent * up to here. */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + scn->scn_issued_before_pass = scn->scn_phys.scn_examined - + scn->scn_phys.scn_skipped; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { @@ -566,7 +612,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) } } - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); /* reload the queue into the in-core state */ if (scn->scn_phys.scn_queue_obj != 0) { @@ -585,6 +631,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) } spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); + return (0); } @@ -599,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp) scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); + mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); @@ -631,18 +680,96 @@ dsl_scan_scrubbing(const dsl_pool_t *dp) } boolean_t +dsl_errorscrubbing(const dsl_pool_t *dp) +{ + dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys; + + return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING && + errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB); +} + +boolean_t +dsl_errorscrub_is_paused(const dsl_scan_t *scn) +{ + return (dsl_errorscrubbing(scn->scn_dp) && + scn->errorscrub_phys.dep_paused_flags); +} + +boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { return (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } +static void +dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + scn->errorscrub_phys.dep_cursor = + zap_cursor_serialize(&scn->errorscrub_cursor); + + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, + &scn->errorscrub_phys, tx)); +} + +static void +dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(!dsl_scan_is_running(scn)); + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + scn->errorscrub_phys.dep_func = *funcp; + scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING; + scn->errorscrub_phys.dep_start_time = gethrestime_sec(); + scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa); + scn->errorscrub_phys.dep_examined = 0; + scn->errorscrub_phys.dep_errors = 0; + scn->errorscrub_phys.dep_cursor = 0; + zap_cursor_init_serialized(&scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + vdev_config_dirty(spa->spa_root_vdev); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START); + + dsl_errorscrub_sync_state(scn, tx); + + spa_history_log_internal(spa, "error scrub setup", tx, + "func=%u mintxg=%u maxtxg=%llu", + *funcp, 0, (u_longlong_t)tx->tx_txg); +} + +static int +dsl_errorscrub_setup_check(void *arg, 
dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) { + return (SET_ERROR(EBUSY)); + } + + if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) { + return (ECANCELED); + } + return (0); +} + /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always * want to write out the record, only when it is "safe" to do so. This safety * condition is achieved by making sure that the sorting queues are empty - * (scn_bytes_pending == 0). When this condition is not true, the sync'd state + * (scn_queues_pending == 0). When this condition is not true, the sync'd state * is inconsistent with how much actual scanning progress has been made. The * kind of sync to be performed is specified by the sync_type argument. If the * sync is optional, we only sync if the queues are empty. If the sync is @@ -665,8 +792,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) int i; spa_t *spa = scn->scn_dp->dp_spa; - ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); - if (scn->scn_bytes_pending == 0) { + ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0); + if (scn->scn_queues_pending == 0) { for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; @@ -688,7 +815,7 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); - bcopy(&scn->scn_phys, &scn->scn_phys_cached, + memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); if (scn->scn_checkpointing) @@ -705,14 +832,15 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) } } -/* ARGSUSED */ int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) || + dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(EBUSY)); return (0); @@ -721,6 +849,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; @@ -729,7 +858,15 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); - bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); + + /* + * If we are starting a fresh scrub, we erase the error scrub + * information from disk. 
+ */ + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + dsl_errorscrub_sync_state(scn, tx); + scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; @@ -744,6 +881,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); + vdev_scan_stat_init(spa->spa_root_vdev); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; @@ -791,13 +929,19 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) /* back to the generic stuff */ - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - mutex_init(&dp->dp_blkstats->zab_lock, NULL, - MUTEX_DEFAULT, NULL); + if (zfs_scan_blkstats) { + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + memset(&dp->dp_blkstats->zab_type, 0, + sizeof (dp->dp_blkstats->zab_type)); + } else { + if (dp->dp_blkstats) { + vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + dp->dp_blkstats = NULL; + } } - bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; @@ -805,7 +949,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); - bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); @@ -816,8 +960,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub, + * error scrub or resilver. Can also be called to resume a paused scrub or + * error scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) @@ -843,6 +988,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB) { + if (dsl_errorscrub_is_paused(dp->dp_scan)) { + /* + * got error scrub start cmd, resume paused error scrub. 
+ */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_RESUME); + return (ECANCELED); + } + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync, + &func, 0, ZFS_SPACE_CHECK_RESERVED)); + } + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, @@ -851,7 +1016,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } - return (SET_ERROR(err)); } @@ -859,7 +1023,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } -/* ARGSUSED */ +static void +dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + if (complete) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH); + spa_history_log_internal(spa, "error scrub done", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } else { + spa_history_log_internal(spa, "error scrub canceled", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } + + scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED; + spa->spa_scrub_active = B_FALSE; + spa_errlog_rotate(spa); + scn->errorscrub_phys.dep_end_time = gethrestime_sec(); + zap_cursor_fini(&scn->errorscrub_cursor); + + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) + spa->spa_errata = 0; + + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); +} + static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { @@ -920,13 +1110,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); else if (!complete) spa_history_log_internal(spa, "scan cancelled", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); else spa_history_log_internal(spa, "scan done", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; @@ -989,7 +1179,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { spa_history_log_internal(spa, "starting deferred resilver", tx, "errors=%llu", - (u_longlong_t)spa_get_errlog_size(spa)); + (u_longlong_t)spa_approx_errlog_size(spa)); spa_async_request(spa, SPA_ASYNC_RESILVER); } @@ -1006,10 +1196,96 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); } -/* ARGSUSED */ +static int +dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + /* + * can't pause an error scrub when there is no in-progress + * error scrub.
+ */ + if (!dsl_errorscrubbing(dp)) + return (SET_ERROR(ENOENT)); + + /* can't pause a paused error scrub */ + if (dsl_errorscrub_is_paused(scn)) + return (SET_ERROR(EBUSY)); + } else if (*cmd != POOL_SCRUB_NORMAL) { + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +static void +dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); + scn->errorscrub_phys.dep_paused_flags = B_TRUE; + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); + } else { + ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); + if (dsl_errorscrub_is_paused(scn)) { + /* + * We need to keep track of how much time we spend + * paused per pass so that we can adjust the error scrub + * rate shown in the output of 'zpool status'. + */ + spa->spa_scan_pass_errorscrub_spent_paused += + gethrestime_sec() - + spa->spa_scan_pass_errorscrub_pause; + + spa->spa_scan_pass_errorscrub_pause = 0; + scn->errorscrub_phys.dep_paused_flags = B_FALSE; + + zap_cursor_init_serialized( + &scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + dsl_errorscrub_sync_state(scn, tx); + } + } +} + +static int +dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + /* can't cancel an error scrub when there is none in progress */ + if (!dsl_errorscrubbing(scn->scn_dp)) + return (SET_ERROR(ENOENT)); + return (0); +} + +static void +dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + dsl_errorscrub_done(scn, B_FALSE, tx); + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_ABORT); +} + static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (!dsl_scan_is_running(scn)) @@ -1017,10 +1293,10 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); @@ -1031,6 +1307,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) int dsl_scan_cancel(dsl_pool_t *dp) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync, + NULL, 3, ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } @@ -1097,6 +1378,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_pause_resume_check, + dsl_errorscrub_pause_resume_sync, &cmd, 3, + ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); @@ -1204,7 +1491,7 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; - ASSERT0(scn->scn_bytes_pending); + ASSERT0(scn->scn_queues_pending); ASSERT(scn->scn_phys.scn_queue_obj != 0); VERIFY0(dmu_object_free(dp->dp_meta_objset, @@ -1275,9 +1562,13 @@ dsl_scan_should_clear(dsl_scan_t *scn) mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { - /* # extents in exts_by_size = # in exts_by_addr */ + /* + * # of extents in exts_by_addr = # in exts_by_size. + * B-tree efficiency is ~75%, but can be as low as 50%. + */ mused += zfs_btree_numnodes(&queue->q_exts_by_size) * - sizeof (range_seg_gap_t) + queue->q_sio_memused; + ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) * + 3 / 2) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } @@ -1285,7 +1576,7 @@ dsl_scan_should_clear(dsl_scan_t *scn) dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); if (mused == 0) - ASSERT0(scn->scn_bytes_pending); + ASSERT0(scn->scn_queues_pending); /* * If we are above our hard limit, we need to clear out memory. @@ -1335,12 +1626,13 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + uint64_t dirty_min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; if ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || @@ -1378,16 +1670,52 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) return (B_FALSE); } +static boolean_t +dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* + * We suspend if: + * - we have scrubbed for at least the minimum time (default 1 sec + * for error scrub), someone is explicitly waiting for this txg + * to complete, or we have used up all of the time in the txg + * timeout (default 5 sec). + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. 
+ */ + uint64_t curr_time_ns = gethrtime(); + uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int mintime = zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(error_scrub_time_ns) > mintime && + (txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("error scrub suspending at bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + } + return (B_TRUE); + } + return (B_FALSE); +} + typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; -/* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { + (void) zilog; zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; @@ -1395,7 +1723,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1404,7 +1733,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], @@ -1414,11 +1744,11 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, return (0); } -/* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, uint64_t claim_txg) { + (void) zilog; if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; @@ -1430,7 +1760,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || - bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1438,9 +1768,10 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, * already txg sync'ed (but this log block contains * other records that are not synced) */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -1491,7 +1822,7 @@ scan_prefetch_queue_compare(const void *a, const void *b) } static void -scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) +scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag) { if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { zfs_refcount_destroy(&spc->spc_refcnt); @@ -1500,7 +1831,7 @@ scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) } static scan_prefetch_ctx_t * -scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) +scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag) { scan_prefetch_ctx_t *spc; @@ -1522,7 +1853,7 @@ scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) } static void -scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) 
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag) { zfs_refcount_add(&spc->spc_refcnt, tag); } @@ -1576,7 +1907,8 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; @@ -1643,6 +1975,7 @@ static void dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *private) { + (void) zio; scan_prefetch_ctx_t *spc = private; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; @@ -1687,6 +2020,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { + if (OBJSET_BUF_HAS_PROJECTUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_projectused_dnode, zb->zb_objset, + DMU_PROJECTUSED_OBJECT); + } dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); @@ -1702,7 +2040,6 @@ out: scan_prefetch_ctx_rele(spc, scn); } -/* ARGSUSED */ static void dsl_scan_prefetch_thread(void *arg) { @@ -1748,10 +2085,16 @@ dsl_scan_prefetch_thread(void *arg) zio_flags |= ZIO_FLAG_RAW; } + /* We don't need data L1 buffer since we do not prefetch L0. */ + blkptr_t *bp = &spic->spic_bp; + if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) + flags |= ARC_FLAG_NO_BUF; + /* issue the prefetch asynchronously */ - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, - &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + (void) arc_read(scn->scn_zio_root, spa, bp, + dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, + zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } @@ -1788,24 +2131,23 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, /* * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for suspending - * again. + * we went past it, zero it out to indicate that it's OK + * to start checking for suspending again. 
*/ - if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + if (zbookmark_subtree_tbd(dnp, zb, + &scn->scn_phys.scn_bookmark)) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); - bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb)); } } return (B_FALSE); } -static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, +static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); inline __attribute__((always_inline)) static void dsl_scan_visitdnode( @@ -1822,11 +2164,25 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; ASSERT(!BP_IS_REDACTED(bp)); + /* + * There is an unlikely case of encountering dnodes with contradicting + * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created + * or modified before commit 4254acb was merged. As it is not possible + * to know which of the two is correct, report an error. + */ + if (dnp != NULL && + dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { + scn->scn_phys.scn_errors++; + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); + return (SET_ERROR(EINVAL)); + } + if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; @@ -1834,7 +2190,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -1862,7 +2218,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, zio_flags |= ZIO_FLAG_RAW; } - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -1881,7 +2237,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, objset_phys_t *osp; arc_buf_t *buf; - err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; @@ -1912,6 +2268,15 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); + } else if (!zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + /* + * Sanity check the block pointer contents, this is handled + * by arc_read() for the cases above. + */ + scn->scn_phys.scn_errors++; + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); + return (SET_ERROR(EINVAL)); } return (0); @@ -1947,12 +2312,11 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, * first 5; we want them to be useful. 
*/ static void -dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, +dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; - blkptr_t *bp_toread = NULL; if (dsl_scan_check_suspend(scn, zb)) return; @@ -1962,19 +2326,6 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, scn->scn_visited_this_txg++; - /* - * This debugging is commented out to conserve stack space. This - * function is called recursively and the debugging adds several - * bytes to the stack for each call. It can be commented back in - * if required to debug an issue in dsl_scan_visitbp(). - * - * dprintf_bp(bp, - * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", - * ds, ds ? ds->ds_object : 0, - * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, - * bp); - */ - if (BP_IS_HOLE(bp)) { scn->scn_holes_this_txg++; return; @@ -1986,16 +2337,28 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, return; } - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + /* + * Check if this block contradicts any filesystem flags. + */ + spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); + if (f != SPA_FEATURE_NONE) + ASSERT(dsl_dataset_feature_is_active(ds, f)); + + if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } - bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - *bp_toread = *bp; - - if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) - goto out; + if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0) + return; /* * If dsl_scan_ddt() has already visited this block, it will have @@ -2005,7 +2368,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { scn->scn_ddt_contained_this_txg++; - goto out; + return; } /* @@ -2015,15 +2378,12 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, * Don't scan it now unless we need to because something * under it was modified. 
*/ - if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; - goto out; + return; } scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); - -out: - kmem_free(bp_toread, sizeof (blkptr_t)); } static void @@ -2340,7 +2700,6 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -/* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { @@ -2366,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } @@ -2525,10 +2886,10 @@ out: dsl_dataset_rele(ds, FTAG); } -/* ARGSUSED */ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { + (void) arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; @@ -2558,22 +2919,23 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } + mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); + mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } -/* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { + (void) tx; const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; - int p; if (!dsl_scan_is_running(scn)) return; @@ -2592,7 +2954,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; @@ -2609,7 +2971,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * - * We leverage the fact that the dde's replication class (enum ddt_class) + * We leverage the fact that the dde's replication class (ddt_class_t) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * @@ -2640,12 +3002,10 @@ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde; + ddt_entry_t dde = {{{{0}}}}; int error; uint64_t n = 0; - bzero(&dde, sizeof (ddt_entry_t)); - while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; @@ -2708,7 +3068,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; @@ -2738,7 +3097,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ - bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); + memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t)); /* * Keep pulling things out of the dataset avl queue. 
Updates to the @@ -2777,8 +3136,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) } static uint64_t -dsl_scan_count_data_disks(vdev_t *rvd) +dsl_scan_count_data_disks(spa_t *spa) { + vdev_t *rvd = spa->spa_root_vdev; uint64_t i, leaves = 0; for (i = 0; i < rvd->vdev_children; i++) { @@ -2820,12 +3180,13 @@ scan_io_queue_check_suspend(dsl_scan_t *scn) uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; - int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + uint64_t dirty_min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; return ((NSEC2MSEC(scan_time_ns) > mintime && - (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa)); @@ -2844,7 +3205,6 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; - int64_t bytes_issued = 0; boolean_t suspended = B_FALSE; while ((sio = list_head(io_list)) != NULL) { @@ -2856,16 +3216,12 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) } sio2bp(sio, &bp); - bytes_issued += SIO_GET_ASIZE(sio); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); sio_free(sio); } - - atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); - return (suspended); } @@ -2910,6 +3266,8 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); + if (avl_is_empty(&queue->q_sios_by_addr)) + atomic_add_64(&queue->q_scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); bytes_issued += SIO_GET_ASIZE(sio); @@ -2931,12 +3289,13 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) range_tree_resize_segment(queue->q_exts_by_addr, rs, SIO_GET_OFFSET(sio), rs_get_end(rs, queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); - + queue->q_last_ext_addr = SIO_GET_OFFSET(sio); return (B_TRUE); } else { uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); + queue->q_last_ext_addr = -1; return (B_FALSE); } } @@ -2961,31 +3320,8 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); - /* handle tunable overrides */ - if (scn->scn_checkpointing || scn->scn_clearing) { - if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(rt)); - } else if (zfs_scan_issue_strategy == 2) { - /* - * We need to get the original entry in the by_addr - * tree so we can modify it. 
- */ - range_seg_t *size_rs = - zfs_btree_first(&queue->q_exts_by_size, NULL); - if (size_rs == NULL) - return (NULL); - uint64_t start = rs_get_start(size_rs, rt); - uint64_t size = rs_get_end(size_rs, rt) - start; - range_seg_t *addr_rs = range_tree_find(rt, start, - size); - ASSERT3P(addr_rs, !=, NULL); - ASSERT3U(rs_get_start(size_rs, rt), ==, - rs_get_start(addr_rs, rt)); - ASSERT3U(rs_get_end(size_rs, rt), ==, - rs_get_end(addr_rs, rt)); - return (addr_rs); - } - } + if (!scn->scn_checkpointing && !scn->scn_clearing) + return (NULL); /* * During normal clearing, we want to issue our largest segments @@ -2996,28 +3332,42 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) * so the way we are sorted now is as good as it will ever get. * In this case, we instead switch to issuing extents in LBA order. */ - if (scn->scn_checkpointing) { + if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) || + zfs_scan_issue_strategy == 1) return (range_tree_first(rt)); - } else if (scn->scn_clearing) { - /* - * We need to get the original entry in the by_addr - * tree so we can modify it. - */ - range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, - NULL); - if (size_rs == NULL) - return (NULL); - uint64_t start = rs_get_start(size_rs, rt); - uint64_t size = rs_get_end(size_rs, rt) - start; - range_seg_t *addr_rs = range_tree_find(rt, start, size); - ASSERT3P(addr_rs, !=, NULL); - ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs, - rt)); - ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt)); - return (addr_rs); - } else { - return (NULL); + + /* + * Try to continue previous extent if it is not completed yet. After + * shrink in scan_io_queue_gather() it may no longer be the best, but + * otherwise we leave shorter remnant every txg. + */ + uint64_t start; + uint64_t size = 1ULL << rt->rt_shift; + range_seg_t *addr_rs; + if (queue->q_last_ext_addr != -1) { + start = queue->q_last_ext_addr; + addr_rs = range_tree_find(rt, start, size); + if (addr_rs != NULL) + return (addr_rs); } + + /* + * Nothing to continue, so find new best extent. + */ + uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL); + if (v == NULL) + return (NULL); + queue->q_last_ext_addr = start = *v << rt->rt_shift; + + /* + * We need to get the original entry in the by_addr tree so we can + * modify it. + */ + addr_rs = range_tree_find(rt, start, size); + ASSERT3P(addr_rs, !=, NULL); + ASSERT3U(rs_get_start(addr_rs, rt), ==, start); + ASSERT3U(rs_get_end(addr_rs, rt), >, start); + return (addr_rs); } static void @@ -3026,15 +3376,19 @@ scan_io_queues_run_one(void *arg) dsl_scan_io_queue_t *queue = arg; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; boolean_t suspended = B_FALSE; - range_seg_t *rs = NULL; - scan_io_t *sio = NULL; + range_seg_t *rs; + scan_io_t *sio; + zio_t *zio; list_t sio_list; ASSERT(queue->q_scn->scn_is_sorted); list_create(&sio_list, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_list_node)); + zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa, + NULL, NULL, NULL, ZIO_FLAG_CANFAIL); mutex_enter(q_lock); + queue->q_zio = zio; /* Calculate maximum in-flight bytes for this vdev. 
*/ queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit * @@ -3049,12 +3403,12 @@ scan_io_queues_run_one(void *arg) /* loop until we run out of time or sios */ while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { uint64_t seg_start = 0, seg_end = 0; - boolean_t more_left = B_TRUE; + boolean_t more_left; ASSERT(list_is_empty(&sio_list)); /* loop while we still have sios left to process in this rs */ - while (more_left) { + do { scan_io_t *first_sio, *last_sio; /* @@ -3083,7 +3437,7 @@ scan_io_queues_run_one(void *arg) if (suspended) break; - } + } while (more_left); /* update statistics for debugging purposes */ scan_io_queues_update_seg_stats(queue, seg_start, seg_end); @@ -3096,12 +3450,12 @@ scan_io_queues_run_one(void *arg) * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ - while ((sio = list_head(&sio_list)) != NULL) { - list_remove(&sio_list, sio); + while ((sio = list_remove_head(&sio_list)) != NULL) scan_io_queue_insert_impl(queue, sio); - } + queue->q_zio = NULL; mutex_exit(q_lock); + zio_nowait(zio); list_destroy(&sio_list); } @@ -3122,7 +3476,7 @@ scan_io_queues_run(dsl_scan_t *scn) ASSERT(scn->scn_is_sorted); ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - if (scn->scn_bytes_pending == 0) + if (scn->scn_queues_pending == 0) return; if (scn->scn_taskq == NULL) { @@ -3290,6 +3644,19 @@ dsl_scan_active(dsl_scan_t *scn) return ((used != 0) || (clones_left)); } +boolean_t +dsl_errorscrub_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if (dsl_errorscrubbing(scn->scn_dp)) + return (B_TRUE); + return (B_FALSE); +} + static boolean_t dsl_scan_check_deferred(vdev_t *vd) { @@ -3439,11 +3806,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) scn->scn_dedup_frees_this_txg = 0; /* - * Write out changes to the DDT that may be required as a - * result of the blocks freed. This ensures that the DDT - * is clean when a scrub/resilver runs. + * Write out changes to the DDT and the BRT that may be required + * as a result of the blocks freed. This ensures that the DDT + * and the BRT are clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); + brt_sync(spa, tx->tx_txg); } if (err != 0) return (err); @@ -3505,6 +3873,387 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) return (0); } +static void +name_to_bookmark(char *buf, zbookmark_phys_t *zb) +{ + zb->zb_objset = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static void +read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + objset_t *os; + if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0) + return; + + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + /* + * If the key is not loaded dbuf_dnode_findbp() will error out with + * EACCES. However in that case dnode_hold() will eventually call + * dbuf_read()->zio_wait() which may call spa_log_error(). This will + * lead to a deadlock due to us holding the mutex spa_errlist_lock. 
+ * Avoid this by checking here if the keys are loaded, if not return. + * If the keys are not loaded the head_errlog feature is meaningless + * as we cannot figure out the birth txg of the block pointer. + */ + if (dsl_dataset_get_keystatus(ds->ds_dir) == + ZFS_KEYSTATUS_UNAVAILABLE) { + dsl_dataset_rele(ds, FTAG); + return; + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL, + NULL); + + if (error) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + if (!error && BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB; + + /* If it's an intent log block, failure is expected. */ + if (zb.zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + ASSERT(!BP_IS_EMBEDDED(&bp)); + scan_exec_io(dp, &bp, zio_flags, &zb, NULL); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * We keep track of the scrubbed error blocks in "count". This will be used + * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This + * function is modelled after check_filesystem(). + */ +static int +scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, + int *count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + + error = find_birth_txg(ds, zep, &latest_txg); + + /* + * If find_birth_txg() errors out, then err on the side of caution and + * proceed. In worst case scenario scrub all objects. If zep->zb_birth + * is 0 (e.g. in case of encryption with unloaded keys) also proceed to + * scrub all objects. + */ + if (error == 0 && zep->zb_birth == latest_txg) { + /* Block neither free nor re written. */ + zbookmark_phys_t zb; + zep_to_zb(fs, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + + check_snapshot = B_FALSE; + } else if (error == 0) { + txg_to_consider = latest_txg; + } + + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + } + + if (snap_count == 0) { + /* Filesystem without snapshots. 
*/ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + dsl_dataset_rele(ds, FTAG); + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + if (error != 0) + return (error); + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + continue; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + + /* + * Scrub the snapshot also when zb_birth == 0 or when + * find_birth_txg() returns an error. + */ + affected = (error == 0 && zep->zb_birth == blk_txg) || + (error != 0) || (zep->zb_birth == 0); + } + + /* Scrub snapshots. */ + if (affected) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (EFAULT); + } + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + return (0); +} + +void +dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Only process scans in sync pass 1. + */ + + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) { + return; + } + + if (dsl_scan_resilvering(scn->scn_dp)) { + /* cancel the error scrub if resilver started */ + dsl_scan_cancel(scn->scn_dp); + return; + } + + spa->spa_scrub_active = B_TRUE; + scn->scn_sync_start_time = gethrtime(); + + /* + * zfs_scan_suspend_progress can be set to disable scrub progress. + * See more detailed comment in dsl_scan_sync(). 
+ */ + if (zfs_scan_suspend_progress) { + uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; + int mintime = zfs_scrub_min_time_ms; + + while (zfs_scan_suspend_progress && + !txg_sync_waiting(scn->scn_dp) && + !spa_shutting_down(scn->scn_dp->dp_spa) && + NSEC2MSEC(scan_time_ns) < mintime) { + delay(hz); + scan_time_ns = gethrtime() - scn->scn_sync_start_time; + } + return; + } + + int i = 0; + zap_attribute_t *za; + zbookmark_phys_t *zb; + boolean_t limit_exceeded = B_FALSE; + + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + name_to_bookmark(za->za_name, zb); + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); + read_by_block_level(scn, *zb); + dsl_pool_config_exit(dp, FTAG); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined += 1; + scn->errorscrub_phys.dep_to_examine -= 1; + i++; + if (i == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, zb)) { + limit_exceeded = B_TRUE; + break; + } + } + + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + return; + } + + int error = 0; + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; + uint64_t head_ds; + name_to_object(za->za_name, &head_ds); + boolean_t config_held = B_FALSE; + uint64_t top_affected_fs; + + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { + + name_to_errphys(head_ds_attr->za_name, &head_ds_block); + + /* + * In case we are called from spa_sync the pool + * config is already held. + */ + if (!dsl_pool_config_held(dp)) { + dsl_pool_config_enter(dp, FTAG); + config_held = B_TRUE; + } + + error = find_top_affected_fs(spa, + head_ds, &head_ds_block, &top_affected_fs); + if (error) + break; + + error = scrub_filesystem(spa, top_affected_fs, + &head_ds_block, &i); + + if (error == SET_ERROR(EFAULT)) { + limit_exceeded = B_TRUE; + break; + } + } + + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + if (config_held) + dsl_pool_config_exit(dp, FTAG); + } + + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); +} + /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we @@ -3608,8 +4357,9 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) */ if (zfs_scan_suspend_progress) { uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
- zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + uint_t mintime = (scn->scn_phys.scn_func == + POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : + zfs_scrub_min_time_ms; while (zfs_scan_suspend_progress && !txg_sync_waiting(scn->scn_dp) && @@ -3622,6 +4372,16 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) } /* + * Disabled by default, set zfs_scan_report_txgs to report + * average performance over the last zfs_scan_report_txgs TXGs. + */ + if (zfs_scan_report_txgs != 0 && + tx->tx_txg % zfs_scan_report_txgs == 0) { + scn->scn_issued_before_pass += spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); + } + + /* * It is possible to switch from unsorted to sorted at any time, * but afterwards the scan will remain sorted unless reloaded from * a checkpoint after a reboot. @@ -3680,12 +4440,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) taskqid_t prefetch_tqid; /* - * Recalculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { @@ -3749,12 +4510,15 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (scn->scn_is_sorted) { scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; + scn->scn_issued_before_pass += + spa->spa_scan_pass_issued; + spa_scan_stat_init(spa); } zfs_dbgmsg("scan complete for %s txg %llu", spa->spa_name, (longlong_t)tx->tx_txg); } - } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { + } else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) { ASSERT(scn->scn_clearing); /* need to issue scrubbing IOs from per-vdev queues */ @@ -3784,7 +4548,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) spa->spa_name); ASSERT3U(scn->scn_done_txg, !=, 0); ASSERT0(spa->spa_scrub_inflight); - ASSERT0(scn->scn_bytes_pending); + ASSERT0(scn->scn_queues_pending); dsl_scan_done(scn, B_TRUE, tx); sync_type = SYNC_MANDATORY; } @@ -3793,10 +4557,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) } static void -count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) +count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all) { - int i; - /* * Don't count embedded bp's, since we already did the work of * scanning these when we scanned the containing block. @@ -3811,18 +4573,22 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) * zio code will only try the first one unless there is an issue. * Therefore, we should only count the first DVA for these IOs. */ - if (scn->scn_is_sorted) { - atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[0])); - } else { - spa_t *spa = scn->scn_dp->dp_spa; + atomic_add_64(&spa->spa_scan_pass_issued, + all ? 
BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); +} - for (i = 0; i < BP_GET_NDVAS(bp); i++) { - atomic_add_64(&spa->spa_scan_pass_issued, - DVA_GET_ASIZE(&bp->blk_dva[i])); - } - } +static void +count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) +{ + if (BP_IS_EMBEDDED(bp)) + return; + atomic_add_64(&scn->scn_phys.scn_skipped, + all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); +} +static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. @@ -3830,9 +4596,7 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) if (zab == NULL) return; - mutex_enter(&zab->zab_lock); - - for (i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; @@ -3867,28 +4631,27 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) break; } } - - mutex_exit(&zab->zab_lock); } static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; - int64_t asize = SIO_GET_ASIZE(sio); dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + if (unlikely(avl_is_empty(&queue->q_sios_by_addr))) + atomic_add_64(&scn->scn_queues_pending, 1); if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ - atomic_add_64(&scn->scn_bytes_pending, -asize); sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); queue->q_sio_memused += SIO_GET_MUSED(sio); - range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); + range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), + SIO_GET_ASIZE(sio)); } /* @@ -3901,7 +4664,6 @@ static void scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { - dsl_scan_t *scn = queue->q_scn; scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); @@ -3911,13 +4673,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, sio->sio_flags = zio_flags; sio->sio_zb = *zb; - /* - * Increment the bytes pending counter now so that we can't - * get an integer underflow in case the worker processes the - * zio before we get to incrementing this counter. - */ - atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); - + queue->q_last_ext_addr = -1; scan_io_queue_insert_impl(queue, sio); } @@ -3967,15 +4723,15 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + uint64_t phys_birth = BP_GET_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - + count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { - count_block(scn, dp->dp_blkstats, bp); + count_block_skipped(scn, bp, B_TRUE); return (0); } @@ -4003,8 +4759,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, * Keep track of how much data we've examined so that * zpool(8) status can make useful progress reports. 
*/ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); + uint64_t asize = DVA_GET_ASIZE(dva); + scn->scn_phys.scn_examined += asize; + spa->spa_scan_pass_exam += asize; /* if it's a resilver, this may not be in the target range */ if (!needs_io) @@ -4015,7 +4772,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { - count_block(scn, dp->dp_blkstats, bp); + count_block_skipped(scn, bp, B_TRUE); } /* do not relocate this block */ @@ -4047,7 +4804,14 @@ dsl_scan_scrub_done(zio_t *zio) if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + if (dsl_errorscrubbing(spa->spa_dsl_pool) && + !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan + ->errorscrub_phys.dep_errors); + } else { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys + .scn_errors); + } } } @@ -4066,6 +4830,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); abd_t *data = abd_alloc_for_io(size, B_FALSE); + zio_t *pio; if (queue == NULL) { ASSERT3U(scn->scn_maxinflight_bytes, >, 0); @@ -4074,6 +4839,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); + pio = scn->scn_zio_root; } else { kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; @@ -4082,12 +4848,14 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) cv_wait(&queue->q_zio_cv, q_lock); queue->q_inflight_bytes += BP_GET_PSIZE(bp); + pio = queue->q_zio; mutex_exit(q_lock); } - count_block(scn, dp->dp_blkstats, bp); - zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size, - dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); + ASSERT(pio != NULL); + count_block_issued(spa, bp, queue == NULL); + zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done, + queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* @@ -4121,33 +4889,93 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, * extents that are more completely filled (in a 3:2 ratio) vs just larger. * Note that as an optimization, we replace multiplication and division by * 100 with bitshifting by 7 (which effectively multiplies and divides by 128). + * + * Since we do not care if one extent is only few percent better than another, + * compress the score into 6 bits via binary logarithm AKA highbit64() and + * put into otherwise unused due to ashift high bits of offset. This allows + * to reduce q_exts_by_size B-tree elements to only 64 bits and compare them + * with single operation. Plus it makes scrubs more sequential and reduces + * chances that minor extent change move it within the B-tree. 
*/ +__attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { - const range_seg_gap_t *rsa = x, *rsb = y; + const uint64_t *a = x, *b = y; - uint64_t sa = rsa->rs_end - rsa->rs_start; - uint64_t sb = rsb->rs_end - rsb->rs_start; - uint64_t score_a, score_b; + return (TREE_CMP(*a, *b)); +} - score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * - fill_weight * rsa->rs_fill) >> 7); - score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * - fill_weight * rsb->rs_fill) >> 7); +ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, + ext_size_compare) - if (score_a > score_b) - return (-1); - if (score_a == score_b) { - if (rsa->rs_start < rsb->rs_start) - return (-1); - if (rsa->rs_start == rsb->rs_start) - return (0); - return (1); - } - return (1); +static void +ext_size_create(range_tree_t *rt, void *arg) +{ + (void) rt; + zfs_btree_t *size_tree = arg; + + zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, + sizeof (uint64_t)); } +static void +ext_size_destroy(range_tree_t *rt, void *arg) +{ + (void) rt; + zfs_btree_t *size_tree = arg; + ASSERT0(zfs_btree_numnodes(size_tree)); + + zfs_btree_destroy(size_tree); +} + +static uint64_t +ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg) +{ + (void) rt; + uint64_t size = rsg->rs_end - rsg->rs_start; + uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) * + fill_weight * rsg->rs_fill) >> 7); + ASSERT3U(rt->rt_shift, >=, 8); + return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start); +} + +static void +ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + zfs_btree_t *size_tree = arg; + ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); + uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + zfs_btree_add(size_tree, &v); +} + +static void +ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + zfs_btree_t *size_tree = arg; + ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); + uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + zfs_btree_remove(size_tree, &v); +} + +static void +ext_size_vacate(range_tree_t *rt, void *arg) +{ + zfs_btree_t *size_tree = arg; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + ext_size_create(rt, arg); +} + +static const range_tree_ops_t ext_size_ops = { + .rtop_create = ext_size_create, + .rtop_destroy = ext_size_destroy, + .rtop_add = ext_size_add, + .rtop_remove = ext_size_remove, + .rtop_vacate = ext_size_vacate +}; + /* * Comparator for the q_sios_by_addr tree. Sorting is simply performed * based on LBA-order (from lowest to highest). 
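Aside on the comparator change above (not part of the diff): the new q_exts_by_size ordering works by packing each extent's fill-weighted score, reduced to 6 bits via highbit64(), into the top byte of a 64-bit key, with the start offset (kept in ashift-sized units, so its top byte is guaranteed free) in the low bits; a single unsigned compare then orders extents best-first and breaks ties by LBA. The following is a minimal standalone sketch of that packing scheme only; the helper names, the weight value of 3, and the loop-based highbit64() are assumptions for illustration and do not appear in the patch.

#include <stdint.h>
#include <stdio.h>

/* 1-based index of the highest set bit; 0 for v == 0 */
static int
highbit64(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}

/*
 * Pack a fill-weighted score (as a 6-bit "distance" from the top bit) and
 * the extent start into one 64-bit key.  Smaller keys mean better extents,
 * so an ascending B-tree walk yields the best extent first.
 */
static uint64_t
pack_ext_key(uint64_t start, uint64_t size, uint64_t fill, uint64_t weight)
{
	uint64_t score = fill + ((((fill << 7) / size) * weight * fill) >> 7);

	return (((uint64_t)(64 - highbit64(score)) << 56) | start);
}

int
main(void)
{
	/* a densely filled 1 MiB extent vs. a sparsely filled 4 MiB extent */
	uint64_t dense = pack_ext_key(1ULL << 20, 1ULL << 20, 1ULL << 20, 3);
	uint64_t sparse = pack_ext_key(8ULL << 20, 4ULL << 20, 1ULL << 18, 3);

	printf("dense  key = 0x%016llx\n", (unsigned long long)dense);
	printf("sparse key = 0x%016llx\n", (unsigned long long)sparse);
	printf("dense issued first: %s\n", dense < sparse ? "yes" : "no");
	return (0);
}

With these example numbers the densely filled extent gets the smaller key and is therefore fetched first, which matches the intent described in the comment above: minor score changes move an extent very little within the B-tree, and equally scored extents are issued in LBA order.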
@@ -4170,9 +4998,10 @@ scan_io_queue_create(vdev_t *vd) q->q_scn = scn; q->q_vd = vd; q->q_sio_memused = 0; + q->q_last_ext_addr = -1; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, - &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); + q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP, + &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); @@ -4190,21 +5019,20 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; void *cookie = NULL; - int64_t bytes_dequeued = 0; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + if (!avl_is_empty(&queue->q_sios_by_addr)) + atomic_add_64(&scn->scn_queues_pending, -1); while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); - bytes_dequeued += SIO_GET_ASIZE(sio); queue->q_sio_memused -= SIO_GET_MUSED(sio); sio_free(sio); } ASSERT0(queue->q_sio_memused); - atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); avl_destroy(&queue->q_sios_by_addr); @@ -4300,28 +5128,22 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) sio_free(srch_sio); if (sio != NULL) { - int64_t asize = SIO_GET_ASIZE(sio); blkptr_t tmpbp; /* Got it while it was cold in the queue */ ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); - ASSERT3U(size, ==, asize); + ASSERT3U(size, ==, SIO_GET_ASIZE(sio)); avl_remove(&queue->q_sios_by_addr, sio); + if (avl_is_empty(&queue->q_sios_by_addr)) + atomic_add_64(&scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); - /* - * We only update scn_bytes_pending in the cold path, - * otherwise it will already have been accounted for as - * part of the zio's execution. 
- */ - atomic_add_64(&scn->scn_bytes_pending, -asize); - - /* count the block as though we issued it */ + /* count the block as though we skipped it */ sio2bp(sio, &tmpbp); - count_block(scn, dp->dp_blkstats, &tmpbp); + count_block_skipped(scn, &tmpbp, B_FALSE); sio_free(sio); } @@ -4379,20 +5201,19 @@ dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for scrubs and resilvers"); -ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW, "Min millisecs to scrub per txg"); -ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW, "Min millisecs to obsolete per txg"); -ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW, "Min millisecs to free per txg"); -ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW, "Min millisecs to resilver per txg"); ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, @@ -4404,40 +5225,48 @@ ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, "Set to disable scrub prefetching"); -ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW, "Max number of blocks freed in one txg"); -ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW, "Max number of dedup blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, "Enable processing of the free_bpobj"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW, + "Enable block statistics calculation during scrub"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW, "Fraction of RAM for scan hard limit"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW, - "IO issuing strategy during scrubbing. " - "0 = default, 1 = LBA, 2 = size"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW, + "IO issuing strategy during scrubbing. 
0 = default, 1 = LBA, 2 = size"); ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, "Scrub using legacy non-sequential method"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW, "Scan progress on-disk checkpointing interval"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW, "Max gap in bytes between sequential scrub / resilver I/Os"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW, "Fraction of hard limit used as soft limit"); ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, "Tunable to attempt to reduce lock contention"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); +ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, + "Tunable to report resilver performance over the last N txgs"); + ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); + +ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW, + "Error blocks to be scrubbed in one txg"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dsl_synctask.c b/sys/contrib/openzfs/module/zfs/dsl_synctask.c index 148e8fff2437..409e12884d91 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_synctask.c +++ b/sys/contrib/openzfs/module/zfs/dsl_synctask.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -32,10 +32,10 @@ #define DST_AVG_BLKSHIFT 14 -/* ARGSUSED */ static int dsl_null_checkfunc(void *arg, dmu_tx_t *tx) { + (void) arg, (void) tx; return (0); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_userhold.c b/sys/contrib/openzfs/module/zfs/dsl_userhold.c index 75d153194a00..75953f70f926 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_userhold.c +++ b/sys/contrib/openzfs/module/zfs/dsl_userhold.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -115,7 +115,7 @@ dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { dsl_dataset_t *ds; int error = 0; - char *htag, *name; + const char *htag, *name; /* must be a snapshot */ name = nvpair_name(pair); @@ -346,7 +346,7 @@ dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) return (ret); } -typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, +typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, const void *tag, dsl_dataset_t **dsp); typedef struct dsl_dataset_user_release_arg { @@ -359,7 +359,7 @@ typedef struct dsl_dataset_user_release_arg { /* Place a dataset hold on the snapshot identified by passed dsobj string */ static int -dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, +dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp)); @@ -572,7 +572,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, { dsl_dataset_user_release_arg_t ddura; nvpair_t *pair; - char *pool; + const char *pool; int error; pair = nvlist_next_nvpair(holds, NULL); diff --git a/sys/contrib/openzfs/module/zfs/edonr_zfs.c b/sys/contrib/openzfs/module/zfs/edonr_zfs.c index aa00e1c9417e..db21c9cf197c 100644 --- a/sys/contrib/openzfs/module/zfs/edonr_zfs.c +++ b/sys/contrib/openzfs/module/zfs/edonr_zfs.c @@ -45,7 +45,6 @@ edonr_incremental(void *buf, size_t size, void *arg) /* * Native zio_checksum interface for the Edon-R hash function. */ -/*ARGSUSED*/ void abd_checksum_edonr_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) @@ -54,10 +53,10 @@ abd_checksum_edonr_native(abd_t *abd, uint64_t size, EdonRState ctx; ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); + memcpy(&ctx, ctx_template, sizeof (ctx)); (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); EdonRFinal(&ctx, digest); - bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); + memcpy(zcp->zc_word, digest, sizeof (zcp->zc_word)); } /* @@ -89,18 +88,17 @@ abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) * size by double-hashing it (the new salt block will be composed of * H(salt) || H(H(salt))). */ - CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8)); - EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, - salt_block); - EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block + - EDONR_MODE / 8); + _Static_assert(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8), + "Edon-R block size mismatch"); + EdonRHash(salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, salt_block); + EdonRHash(salt_block, EDONR_MODE, salt_block + EDONR_MODE / 8); /* * Feed the new salt block into the hash function - this will serve * as our MAC key. 
*/ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - EdonRInit(ctx, EDONR_MODE); + EdonRInit(ctx); EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8); return (ctx); } @@ -108,8 +106,8 @@ abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) void abd_checksum_edonr_tmpl_free(void *ctx_template) { - EdonRState *ctx = ctx_template; + EdonRState *ctx = ctx_template; - bzero(ctx, sizeof (*ctx)); + memset(ctx, 0, sizeof (*ctx)); kmem_free(ctx, sizeof (*ctx)); } diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c index b8a1c7c8a5ca..77d87b694a43 100644 --- a/sys/contrib/openzfs/module/zfs/fm.c +++ b/sys/contrib/openzfs/module/zfs/fm.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -68,9 +68,9 @@ #include <sys/condvar.h> #include <sys/zfs_ioctl.h> -int zfs_zevent_len_max = 512; +static uint_t zfs_zevent_len_max = 512; -static int zevent_len_cur = 0; +static uint_t zevent_len_cur = 0; static int zevent_waiters = 0; static int zevent_flags = 0; @@ -148,8 +148,7 @@ zfs_zevent_drain(zevent_t *ev) list_remove(&zevent_list, ev); /* Remove references to this event in all private file data */ - while ((ze = list_head(&ev->ev_ze_list)) != NULL) { - list_remove(&ev->ev_ze_list, ze); + while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) { ze->ze_zevent = NULL; ze->ze_dropped++; } @@ -158,7 +157,7 @@ zfs_zevent_drain(zevent_t *ev) } void -zfs_zevent_drain_all(int *count) +zfs_zevent_drain_all(uint_t *count) { zevent_t *ev; @@ -380,8 +379,7 @@ zfs_zevent_wait(zfs_zevent_t *ze) break; } - error = cv_wait_sig(&zevent_cv, &zevent_lock); - if (signal_pending(current)) { + if (cv_wait_sig(&zevent_cv, &zevent_lock) == 0) { error = SET_ERROR(EINTR); break; } else if (!list_is_empty(&zevent_list)) { @@ -483,21 +481,21 @@ zfs_zevent_destroy(zfs_zevent_t *ze) /* * Wrappers for FM nvlist allocators */ -/* ARGSUSED */ static void * i_fm_alloc(nv_alloc_t *nva, size_t size) { - return (kmem_zalloc(size, KM_SLEEP)); + (void) nva; + return (kmem_alloc(size, KM_SLEEP)); } -/* ARGSUSED */ static void i_fm_free(nv_alloc_t *nva, void *buf, size_t size) { + (void) nva; kmem_free(buf, size); } -const nv_alloc_ops_t fm_mem_alloc_ops = { +static const nv_alloc_ops_t fm_mem_alloc_ops = { .nv_ao_init = NULL, .nv_ao_fini = NULL, .nv_ao_alloc = i_fm_alloc, @@ -702,7 +700,7 @@ i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) case DATA_TYPE_STRING_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_string_array(payload, name, - va_arg(ap, char **), nelem); + va_arg(ap, const char **), nelem); break; case DATA_TYPE_NVLIST: ret = nvlist_add_nvlist(payload, name, @@ -711,7 +709,7 @@ i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) case DATA_TYPE_NVLIST_ARRAY: nelem = va_arg(ap, int); ret = nvlist_add_nvlist_array(payload, name, - va_arg(ap, nvlist_t **), nelem); + va_arg(ap, const nvlist_t **), nelem); break; default: ret = EINVAL; @@ -867,8 +865,10 @@ fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, } va_end(ap); - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) + if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, + (const nvlist_t **)pairs, npairs) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + } 
for (i = 0; i < npairs; i++) fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); @@ -891,7 +891,7 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, uint_t n; int i, j; va_list ap; - char *hcname, *hcid; + const char *hcname, *hcid; if (!fm_fmri_hc_set_common(fmri, version, auth)) return; @@ -953,6 +953,7 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, } atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); + va_end(ap); return; } } @@ -961,8 +962,8 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, /* * Create the fmri hc list */ - if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, - npairs + n) != 0) { + if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, + (const nvlist_t **)pairs, npairs + n) != 0) { atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); return; } @@ -1128,7 +1129,7 @@ fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, if (serial != NULL) { if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, - (char **)&serial, 1) != 0) { + (const char **)&serial, 1) != 0) { atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); } @@ -1340,7 +1341,7 @@ fm_init(void) void fm_fini(void) { - int count; + uint_t count; zfs_ereport_fini(); @@ -1352,7 +1353,7 @@ fm_fini(void) zevent_flags |= ZEVENT_SHUTDOWN; while (zevent_waiters > 0) { mutex_exit(&zevent_lock); - schedule(); + kpreempt(KPREEMPT_SYNC); mutex_enter(&zevent_lock); } mutex_exit(&zevent_lock); @@ -1368,5 +1369,5 @@ fm_fini(void) } #endif /* _KERNEL */ -ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, UINT, ZMOD_RW, "Max event queue length"); diff --git a/sys/contrib/openzfs/module/zfs/gzip.c b/sys/contrib/openzfs/module/zfs/gzip.c index e2c6e59969d6..f3b19446352a 100644 --- a/sys/contrib/openzfs/module/zfs/gzip.c +++ b/sys/contrib/openzfs/module/zfs/gzip.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -28,7 +28,6 @@ #include <sys/debug.h> #include <sys/types.h> -#include <sys/strings.h> #include <sys/qat.h> #include <sys/zio_compress.h> @@ -66,7 +65,7 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) if (d_len != s_len) return (s_len); - bcopy(s_start, d_start, s_len); + memcpy(d_start, s_start, s_len); return (s_len); } /* if hardware compression fails, do it again with software */ @@ -76,17 +75,17 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) if (d_len != s_len) return (s_len); - bcopy(s_start, d_start, s_len); + memcpy(d_start, s_start, s_len); return (s_len); } return ((size_t)dstlen); } -/*ARGSUSED*/ int gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { + (void) n; zlen_t dstlen = d_len; ASSERT(d_len >= s_len); diff --git a/sys/contrib/openzfs/module/zfs/hkdf.c b/sys/contrib/openzfs/module/zfs/hkdf.c index 14265472df7d..580544c8ac1a 100644 --- a/sys/contrib/openzfs/module/zfs/hkdf.c +++ b/sys/contrib/openzfs/module/zfs/hkdf.c @@ -36,7 +36,6 @@ hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, mech.cm_param_len = 0; /* initialize the salt as a crypto key */ - key.ck_format = CRYPTO_KEY_RAW; key.ck_length = CRYPTO_BYTES2BITS(salt_len); key.ck_data = salt; @@ -53,7 +52,7 @@ hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, output_cd.cd_raw.iov_base = (char *)out_buf; output_cd.cd_raw.iov_len = output_cd.cd_length; - ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL); + ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); @@ -83,7 +82,6 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, mech.cm_param_len = 0; /* initialize the salt as a crypto key */ - key.ck_format = CRYPTO_KEY_RAW; key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH); key.ck_data = extract_key; @@ -110,19 +108,19 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, T_cd.cd_length = T_len; T_cd.cd_raw.iov_len = T_cd.cd_length; - ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL); + ret = crypto_mac_init(&mech, &key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - ret = crypto_mac_update(ctx, &T_cd, NULL); + ret = crypto_mac_update(ctx, &T_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - ret = crypto_mac_update(ctx, &info_cd, NULL); + ret = crypto_mac_update(ctx, &info_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - ret = crypto_mac_update(ctx, &c_cd, NULL); + ret = crypto_mac_update(ctx, &c_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); @@ -130,11 +128,11 @@ hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, T_cd.cd_length = T_len; T_cd.cd_raw.iov_len = T_cd.cd_length; - ret = crypto_mac_final(ctx, &T_cd, NULL); + ret = crypto_mac_final(ctx, &T_cd); if (ret != CRYPTO_SUCCESS) return (SET_ERROR(EIO)); - bcopy(T, out_buf + pos, + memcpy(out_buf + pos, T, (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos)); pos += SHA512_DIGEST_LENGTH; } diff --git a/sys/contrib/openzfs/module/zfs/lz4.c b/sys/contrib/openzfs/module/zfs/lz4.c index 9da9d9e00635..75a31bf17ea4 100644 --- a/sys/contrib/openzfs/module/zfs/lz4.c +++ b/sys/contrib/openzfs/module/zfs/lz4.c @@ -1,165 +1,50 @@ /* - * LZ4 - Fast LZ compression algorithm - * Header File - * Copyright (C) 2011-2013, Yann Collet. 
- * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/* + * This file contains unmodified code from lz4 1.9.3's decompressor, plus + * associated macros and constants. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * It also contains a couple of defines from the old lz4.c to make things + * fit together smoothly. 
* - * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ */ #include <sys/zfs_context.h> -#include <sys/zio_compress.h> -static int real_LZ4_compress(const char *source, char *dest, int isize, - int osize); -static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, +int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, int maxOutputSize); -static int LZ4_compressCtx(void *ctx, const char *source, char *dest, - int isize, int osize); -static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest, - int isize, int osize); - -static void *lz4_alloc(int flags); -static void lz4_free(void *ctx); - -/*ARGSUSED*/ -size_t -lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, - size_t d_len, int n) -{ - uint32_t bufsiz; - char *dest = d_start; - - ASSERT(d_len >= sizeof (bufsiz)); - - bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len, - d_len - sizeof (bufsiz)); - - /* Signal an error if the compression routine returned zero. */ - if (bufsiz == 0) - return (s_len); - - /* - * The exact compressed size is needed by the decompression routine, - * so it is stored at the start of the buffer. Note that this may be - * less than the compressed block size, which is rounded up to a - * multiple of 1<<ashift. - */ - *(uint32_t *)dest = BE_32(bufsiz); - - return (bufsiz + sizeof (bufsiz)); -} - -/*ARGSUSED*/ -int -lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, - size_t d_len, int n) -{ - const char *src = s_start; - uint32_t bufsiz = BE_IN32(src); - - /* invalid compressed buffer size encoded at start */ - if (bufsiz + sizeof (bufsiz) > s_len) - return (1); - - /* - * Returns 0 on success (decompression function returned non-negative) - * and non-zero on failure (decompression function returned negative). - */ - return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], - d_start, bufsiz, d_len) < 0); -} - -/* - * LZ4 API Description: - * - * Simple Functions: - * real_LZ4_compress() : - * isize : is the input size. Max supported value is ~1.9GB - * return : the number of bytes written in buffer dest - * or 0 if the compression fails (if LZ4_COMPRESSMIN is set). - * note : destination buffer must be already allocated. - * destination buffer must be sized to handle worst cases - * situations (input data not compressible) worst case size - * evaluation is provided by function LZ4_compressBound(). - * - * real_LZ4_uncompress() : - * osize : is the output size, therefore the original size - * return : the number of bytes read in the source buffer. - * If the source stream is malformed, the function will stop - * decoding and return a negative result, indicating the byte - * position of the faulty instruction. This function never - * writes beyond dest + osize, and is therefore protected - * against malicious data packets. - * note : destination buffer must be already allocated - * note : real_LZ4_uncompress() is not used in ZFS so its code - * is not present here. - * - * Advanced Functions - * - * LZ4_compressBound() : - * Provides the maximum size that LZ4 may output in a "worst case" - * scenario (input data not compressible) primarily useful for memory - * allocation of output buffer. - * - * isize : is the input size. 
Max supported value is ~1.9GB - * return : maximum output size in a "worst case" scenario - * note : this function is limited by "int" range (2^31-1) - * - * LZ4_uncompress_unknownOutputSize() : - * isize : is the input size, therefore the compressed size - * maxOutputSize : is the size of the destination buffer (which must be - * already allocated) - * return : the number of bytes decoded in the destination buffer - * (necessarily <= maxOutputSize). If the source stream is - * malformed, the function will stop decoding and return a - * negative result, indicating the byte position of the faulty - * instruction. This function never writes beyond dest + - * maxOutputSize, and is therefore protected against malicious - * data packets. - * note : Destination buffer must be already allocated. - * This version is slightly slower than real_LZ4_uncompress() - * - * LZ4_compressCtx() : - * This function explicitly handles the CTX memory structure. - * - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated - * by the caller (either on the stack or using kmem_cache_alloc). Passing - * NULL isn't valid. - * - * LZ4_compress64kCtx() : - * Same as LZ4_compressCtx(), but specific to small inputs (<64KB). - * isize *Must* be <64KB, otherwise the output will be corrupted. - * - * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated - * by the caller (either on the stack or using kmem_cache_alloc). Passing - * NULL isn't valid. - */ /* * Tuning parameters @@ -186,26 +71,6 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #define NOTCOMPRESSIBLE_CONFIRMATION 6 /* - * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to - * performance for big endian cpu, but the resulting compressed stream - * will be incompatible with little-endian CPU. You can set this option - * to 1 in situations where data will stay within closed environment. - * This option is useless on Little_Endian CPU (such as x86). - */ -/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ - -/* - * CPU Feature Detection - */ - -/* 32 or 64 bits ? */ -#if defined(_LP64) -#define LZ4_ARCH64 1 -#else -#define LZ4_ARCH64 0 -#endif - -/* * Little Endian or Big Endian? * Note: overwrite the below #define if you know your architecture endianness. */ @@ -219,25 +84,44 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #undef LZ4_BIG_ENDIAN #endif -/* - * Unaligned memory access is automatically enabled for "common" CPU, - * such as x86. For others CPU, the compiler will be more cautious, and - * insert extra code to ensure aligned access is respected. If you know - * your target CPU supports unaligned memory access, you may want to - * force this option manually to improve performance +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. 
+ * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. + * Prefer these methods in priority order (0 > 1 > 2) */ -#if defined(__ARM_FEATURE_UNALIGNED) -#define LZ4_FORCE_UNALIGNED_ACCESS 1 +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif #endif /* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +/* * Illumos : we can't use GCC's __builtin_ctz family of builtins in the * kernel * Linux : we can use GCC's __builtin_ctz family of builtins in the * kernel */ #undef LZ4_FORCE_SW_BITCOUNT -#if defined(__sparc) +#if defined(__sunos__) #define LZ4_FORCE_SW_BITCOUNT #endif @@ -257,10 +141,50 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) -#define expect(expr, value) (__builtin_expect((expr), (value))) +#ifndef LZ4_FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
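The LZ4_FORCE_MEMORY_ACCESS discussion above is easiest to see with a 16-bit load. A minimal illustration of the three methods (the real LZ4_read16 definitions appear further down in this file); uint16_t and the packed-union name are just for this sketch:

#include <stdint.h>
#include <string.h>

/* Method 0: portable; the fixed-size memcpy is normally inlined away. */
static uint16_t
read16_m0(const void *p)
{
	uint16_t v;
	memcpy(&v, p, sizeof (v));
	return (v);
}

/* Method 1: packed type; relies on a compiler extension (gcc/icc). */
typedef union { uint16_t u16; } __attribute__((packed)) unalign16;
static uint16_t
read16_m1(const void *p)
{
	return (((const unalign16 *)p)->u16);
}

/*
 * Method 2: direct dereference; violates C alignment rules and can fault
 * or generate wrong code on strict-alignment targets.
 */
static uint16_t
read16_m2(const void *p)
{
	return (*(const uint16_t *)p);
}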
+ */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__) +# define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +# undef LZ4_FORCE_INLINE +# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline)) #else -#define expect(expr, value) (expr) +# define LZ4_FORCE_O2 +#endif + +#ifndef expect +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif #endif #ifndef likely @@ -271,814 +195,793 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, #define unlikely(expr) expect((expr) != 0, 0) #endif -#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ - (((x) & 0xffu) << 8))) +#ifndef _KERNEL +#include <stdlib.h> /* malloc, calloc, free */ +#include <string.h> /* memset, memcpy */ +#endif +#define ALLOC(s) malloc(s) +#define ALLOC_AND_ZERO(s) calloc(1,s) +#define FREEMEM(p) free(p) + +#define MEM_INIT(p,v,s) memset((p),(v),(s)) + + +/*-************************************ +* Common Constants +**************************************/ +#define MINMATCH 4 -/* Basic types */ -#define BYTE uint8_t -#define U16 uint16_t -#define U32 uint32_t -#define S32 int32_t -#define U64 uint64_t +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE ((2*WILDCOPYLENGTH) - MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 -#ifndef LZ4_FORCE_UNALIGNED_ACCESS -#pragma pack(1) +#define KB *(1 <<10) +#define MB *(1 <<20) +#define GB *(1U<<30) + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ #endif -typedef struct _U16_S { - U16 v; -} U16_S; -typedef struct _U32_S { - U32 v; -} U32_S; -typedef struct _U64_S { - U64 v; -} U64_S; - -#ifndef LZ4_FORCE_UNALIGNED_ACCESS -#pragma pack() +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +# error "LZ4_DISTANCE_MAX is too big : must be <= 65535" #endif -#define A64(x) (((U64_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A16(x) (((U16_S *)(x))->v) +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) -/* - * Constants - */ -#define MINMATCH 4 +#define DEBUGLOG(l, ...) {} /* disabled */ -#define HASH_LOG COMPRESSIONLEVEL -#define HASHTABLESIZE (1 << HASH_LOG) -#define HASH_MASK (HASHTABLESIZE - 1) +#ifndef assert +#define assert ASSERT +#endif -#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? 
\ - NOTCOMPRESSIBLE_CONFIRMATION : 2) +/*-************************************ +* Types +**************************************/ +#ifndef _KERNEL +#include <limits.h> +#endif +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#ifndef _KERNEL +#include <stdint.h> +#endif + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else +# if UINT_MAX != 4294967295UL +# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +# endif + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif -#define COPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH + MINMATCH) -#define MINLENGTH (MFLIMIT + 1) +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; -#define ML_BITS 4 -#define ML_MASK ((1U<<ML_BITS)-1) -#define RUN_BITS (8-ML_BITS) -#define RUN_MASK ((1U<<RUN_BITS)-1) +/*-************************************ +* Reading and writing into memory +**************************************/ -/* - * Architecture-specific macros +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. 
*/ -#if LZ4_ARCH64 -#define STEPSIZE 8 -#define UARCH U64 -#define AARCH A64 -#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8; -#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) -#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e) -#define HTYPE U32 -#define INITBASE(base) const BYTE* const base = ip -#else /* !LZ4_ARCH64 */ -#define STEPSIZE 4 -#define UARCH U32 -#define AARCH A32 -#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4; -#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d); -#define LZ4_SECURECOPY LZ4_WILDCOPY -#define HTYPE const BYTE * -#define INITBASE(base) const int base = 0 -#endif /* !LZ4_ARCH64 */ - -#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE)) -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ - { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; } -#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \ - { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; } +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) #else -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); } -#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; } +#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) #endif +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} -/* Local structures */ -struct refTables { - HTYPE hashTable[HASHTABLESIZE]; -}; +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ -/* Macros */ -#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \ - HASH_LOG)) -#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) -#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e); -#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \ - d = e; } +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } -/* Private functions */ -#if LZ4_ARCH64 +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) -static inline int -LZ4_NbCommonBytes(register U64 val) -{ -#if defined(LZ4_BIG_ENDIAN) -#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); -#else - int r; - if (!(val >> 32)) { - r = 4; - } else { - r = 0; - val >>= 32; - } - if (!(val >> 16)) { - r += 2; - val >>= 8; - } else { - val >>= 24; - } - r += (!val); - return (r); -#endif -#else -#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); -#else - static const int DeBruijnBytePos[64] = - { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, - 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, - 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, - 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 - }; - return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >> - 58]; -#endif -#endif -} +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; -#else +static U16 LZ4_read16(const void* ptr) { return 
((const unalign*)ptr)->u16; } + +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ -static inline int -LZ4_NbCommonBytes(register U32 val) +static U16 LZ4_read16(const void* memPtr) { -#if defined(LZ4_BIG_ENDIAN) -#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); -#else - int r; - if (!(val >> 16)) { - r = 2; - val >>= 8; - } else { - r = 0; - val >>= 24; - } - r += (!val); - return (r); -#endif -#else -#if defined(__GNUC__) && (GCC_VERSION >= 304) && \ - !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); -#else - static const int DeBruijnBytePos[32] = { - 0, 0, 3, 0, 3, 1, 3, 0, - 3, 2, 2, 1, 3, 2, 0, 1, - 3, 3, 1, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 1, 0, 1, 1 - }; - return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> - 27]; -#endif -#endif + U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; } -#endif +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} -/* Compression functions */ +#endif /* LZ4_FORCE_MEMORY_ACCESS */ -/*ARGSUSED*/ -static int -LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize, - int osize) +static U16 LZ4_readLE16(const void* memPtr) { - struct refTables *srt = (struct refTables *)ctx; - HTYPE *HashTable = (HTYPE *) (srt->hashTable); - - const BYTE *ip = (BYTE *) source; - INITBASE(base); - const BYTE *anchor = ip; - const BYTE *const iend = ip + isize; - const BYTE *const oend = (BYTE *) dest + osize; - const BYTE *const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE *op = (BYTE *) dest; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - - /* Init */ - if (isize < MINLENGTH) - goto _last_literals; - - /* First Byte */ - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - ip++; - forwardH = LZ4_HASH_VALUE(ip); - - /* Main Loop */ - for (;;) { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE *forwardIp = ip; - const BYTE *ref; - BYTE *token; - - /* Find a match */ - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (unlikely(forwardIp > mflimit)) { - goto _last_literals; - } - - forwardH = LZ4_HASH_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = ip - base; - - } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); - - /* Catch up */ - while ((ip > anchor) && (ref > (BYTE *) source) && - unlikely(ip[-1] == ref[-1])) { - ip--; - ref--; - } - - /* Encode Literal length */ - length = ip - anchor; - token = op++; - - /* Check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + - (length >> 8) > oend)) - return (0); - - if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254; len -= 255) - *op++ = 255; - *op++ = (BYTE)len; - } else - *token = (length << ML_BITS); - - /* Copy Literals */ - LZ4_BLINDCOPY(anchor, op, length); - - _next_match: - /* Encode Offset */ - LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); - - /* Start Counting */ - ip += MINMATCH; - ref += MINMATCH; /* MinMatch verified */ - anchor = ip; - while (likely(ip < matchlimit - (STEPSIZE - 1))) { - UARCH diff = AARCH(ref) ^ AARCH(ip); - if (!diff) { - ip += STEPSIZE; - ref += STEPSIZE; - continue; - } - ip += LZ4_NbCommonBytes(diff); - goto _endCount; - } -#if LZ4_ARCH64 - if ((ip < (matchlimit 
- 3)) && (A32(ref) == A32(ip))) { - ip += 4; - ref += 4; - } -#endif - if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { - ip += 2; - ref += 2; - } - if ((ip < matchlimit) && (*ref == *ip)) - ip++; - _endCount: - - /* Encode MatchLength */ - len = (ip - anchor); - /* Check output limit */ - if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) - return (0); - if (len >= (int)ML_MASK) { - *token += ML_MASK; - len -= ML_MASK; - for (; len > 509; len -= 510) { - *op++ = 255; - *op++ = 255; - } - if (len > 254) { - len -= 255; - *op++ = 255; - } - *op++ = (BYTE)len; - } else - *token += len; - - /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; - break; - } - /* Fill table */ - HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base; - - /* Test next position */ - ref = base + HashTable[LZ4_HASH_VALUE(ip)]; - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { - token = op++; - *token = 0; - goto _next_match; - } - /* Prepare next loop */ - anchor = ip++; - forwardH = LZ4_HASH_VALUE(ip); - } - - _last_literals: - /* Encode Last Literals */ - { - int lastRun = iend - anchor; - if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > - oend) - return (0); - if (lastRun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastRun -= RUN_MASK; - for (; lastRun > 254; lastRun -= 255) { - *op++ = 255; - } - *op++ = (BYTE)lastRun; - } else - *op++ = (lastRun << ML_BITS); - (void) memcpy(op, anchor, iend - anchor); - op += iend - anchor; - } - - /* End */ - return (int)(((char *)op) - dest); + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } } +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e); +} -/* Note : this function is valid only if isize < LZ4_64KLIMIT */ -#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1)) -#define HASHLOG64K (HASH_LOG + 1) -#define HASH64KTABLESIZE (1U << HASHLOG64K) -#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \ - HASHLOG64K)) -#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + + +#ifndef LZ4_FAST_DEC_LOOP +# if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +# define LZ4_FAST_DEC_LOOP 1 +# elif defined(__aarch64__) && !defined(__clang__) + /* On aarch64, we disable this optimization for clang because on certain + * mobile chipsets, performance is reduced with clang. 
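inc32table[] and dec64table[] just above exist because a match whose offset is smaller than 8 overlaps its own output, so a plain 8-byte copy would read bytes that have not been written yet. Worked through for a hypothetical offset of 2 (a repeating two-byte pattern), following the steps of LZ4_memcpy_using_offset_base() below:

/*
 * Hypothetical state: op points just past "..ab", match = op - 2.
 *
 *   op[0..3] copied byte-by-byte from match        ->  a b a b
 *   match += inc32table[2]  (== 2, so match == op)
 *   LZ4_memcpy(op + 4, match, 4)                   ->  a b a b a b a b
 *   match -= dec64table[2]  (== 0, no change)
 *   op += 8; from here the effective offset is 8, so LZ4_wildCopy8()
 *   can continue with ordinary 8-byte copies.
 */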
For information + * refer to https://github.com/lz4/lz4/pull/707 */ +# define LZ4_FAST_DEC_LOOP 1 +# else +# define LZ4_FAST_DEC_LOOP 0 +# endif +#endif + +#if LZ4_FAST_DEC_LOOP -/*ARGSUSED*/ -static int -LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize, - int osize) +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) { - struct refTables *srt = (struct refTables *)ctx; - U16 *HashTable = (U16 *) (srt->hashTable); - - const BYTE *ip = (BYTE *) source; - const BYTE *anchor = ip; - const BYTE *const base = ip; - const BYTE *const iend = ip + isize; - const BYTE *const oend = (BYTE *) dest + osize; - const BYTE *const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE *op = (BYTE *) dest; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - /* Init */ - if (isize < MINLENGTH) - goto _last_literals; - - /* First Byte */ - ip++; - forwardH = LZ4_HASH64K_VALUE(ip); - - /* Main Loop */ - for (;;) { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE *forwardIp = ip; - const BYTE *ref; - BYTE *token; - - /* Find a match */ - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (forwardIp > mflimit) { - goto _last_literals; - } - - forwardH = LZ4_HASH64K_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = ip - base; - - } while (A32(ref) != A32(ip)); - - /* Catch up */ - while ((ip > anchor) && (ref > (BYTE *) source) && - (ip[-1] == ref[-1])) { - ip--; - ref--; - } - - /* Encode Literal length */ - length = ip - anchor; - token = op++; - - /* Check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + - (length >> 8) > oend)) - return (0); - - if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254; len -= 255) - *op++ = 255; - *op++ = (BYTE)len; - } else - *token = (length << ML_BITS); - - /* Copy Literals */ - LZ4_BLINDCOPY(anchor, op, length); - - _next_match: - /* Encode Offset */ - LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); - - /* Start Counting */ - ip += MINMATCH; - ref += MINMATCH; /* MinMatch verified */ - anchor = ip; - while (ip < matchlimit - (STEPSIZE - 1)) { - UARCH diff = AARCH(ref) ^ AARCH(ip); - if (!diff) { - ip += STEPSIZE; - ref += STEPSIZE; - continue; - } - ip += LZ4_NbCommonBytes(diff); - goto _endCount; - } -#if LZ4_ARCH64 - if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { - ip += 4; - ref += 4; - } -#endif - if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { - ip += 2; - ref += 2; - } - if ((ip < matchlimit) && (*ref == *ip)) - ip++; - _endCount: - - /* Encode MatchLength */ - len = (ip - anchor); - /* Check output limit */ - if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) - return (0); - if (len >= (int)ML_MASK) { - *token += ML_MASK; - len -= ML_MASK; - for (; len > 509; len -= 510) { - *op++ = 255; - *op++ = 255; - } - if (len > 254) { - len -= 255; - *op++ = 255; - } - *op++ = (BYTE)len; - } else - *token += len; - - /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; - break; - } - /* Fill table */ - HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base; - - /* Test next position */ - ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; - HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; - if (A32(ref) == A32(ip)) { - token = op++; - *token = 0; - goto _next_match; - } - /* Prepare next loop */ - anchor = ip++; - forwardH = 
LZ4_HASH64K_VALUE(ip); - } - - _last_literals: - /* Encode Last Literals */ - { - int lastRun = iend - anchor; - if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > - oend) - return (0); - if (lastRun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastRun -= RUN_MASK; - for (; lastRun > 254; lastRun -= 255) - *op++ = 255; - *op++ = (BYTE)lastRun; - } else - *op++ = (lastRun << ML_BITS); - (void) memcpy(op, anchor, iend - anchor); - op += iend - anchor; - } - - /* End */ - return (int)(((char *)op) - dest); + assert(srcPtr + offset == dstPtr); + if (offset < 8) { + LZ4_write32(dstPtr, 0); /* silence an msan warning when offset==0 */ + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + LZ4_memcpy(dstPtr+4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + LZ4_memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); } -static int -real_LZ4_compress(const char *source, char *dest, int isize, int osize) +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd + * this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. */ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) { - void *ctx; - int result; - - ctx = lz4_alloc(KM_SLEEP); - - /* - * out of kernel memory, gently fall through - this will disable - * compression in zio_compress_data - */ - if (ctx == NULL) - return (0); + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; - memset(ctx, 0, sizeof (struct refTables)); - - if (isize < LZ4_64KLIMIT) - result = LZ4_compress64kCtx(ctx, source, dest, isize, osize); - else - result = LZ4_compressCtx(ctx, source, dest, isize, osize); + do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e); +} - lz4_free(ctx); - return (result); +/* LZ4_memcpy_using_offset() presumes : + * - dstEnd >= dstPtr + MINMATCH + * - there is at least 8 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void +LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch(offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); + LZ4_memcpy(&v[4], v, 4); + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } } +#endif -/* Decompression functions */ -/* - * Note: The decoding functions real_LZ4_uncompress() and - * LZ4_uncompress_unknownOutputSize() are safe against "buffer overflow" - * attack type. They will never write nor read outside of the provided - * output buffers. LZ4_uncompress_unknownOutputSize() also insures that - * it will never read outside of the input buffer. A corrupted input - * will produce an error result, a negative int, indicating the position - * of the error within input stream. 
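A point worth noting about LZ4_wildCopy8()/LZ4_wildCopy32() above: they always copy in whole 8- or 32-byte steps and, per their own comments, can overwrite up to 8 (respectively 32) bytes beyond dstEnd, which is why the decoder only uses them when the MFLIMIT / FASTLOOP_SAFE_DISTANCE checks guarantee that much slack in the output buffer. For example:

/* Requesting a 5-byte copy still performs one full 8-byte step: */
uint8_t srcbuf[16] = { 0 }, dstbuf[16];
LZ4_wildCopy8(dstbuf, srcbuf, dstbuf + 5);	/* writes dstbuf[0..7] */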
+/*-************************************ +* Local Structures and types +**************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. * - * Note[2]: real_LZ4_uncompress(), referred to above, is not used in ZFS so - * its code is not present here. + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Like usingExtDict, but everything concerning the preceding + * content is in a separate context, pointed to by + * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table + * entries in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; -static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; -#if LZ4_ARCH64 -static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; -#endif +/*-******************************* + * Decompression functions + ********************************/ -static int -LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, - int maxOutputSize) -{ - /* Local Variables */ - const BYTE *restrict ip = (const BYTE *) source; - const BYTE *const iend = ip + isize; - const BYTE *ref; - - BYTE *op = (BYTE *) dest; - BYTE *const oend = op + maxOutputSize; - BYTE *cpy; - - /* Main Loop */ - while (ip < iend) { - unsigned token; - size_t length; - - /* get runlength */ - token = *ip++; - if ((length = (token >> ML_BITS)) == RUN_MASK) { - int s = 255; - while ((ip < iend) && (s == 255)) { - s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - } - } - /* copy literals */ - cpy = op + length; - /* CORNER-CASE: cpy might overflow. */ - if (cpy < op) - goto _output_error; /* cpy was overflowed, bail! 
*/ - if ((cpy > oend - COPYLENGTH) || - (ip + length > iend - COPYLENGTH)) { - if (cpy > oend) - /* Error: writes beyond output buffer */ - goto _output_error; - if (ip + length != iend) - /* - * Error: LZ4 format requires to consume all - * input at this stage - */ - goto _output_error; - (void) memcpy(op, ip, length); - op += length; - /* Necessarily EOF, due to parsing restrictions */ - break; - } - LZ4_WILDCOPY(ip, op, cpy); - ip -= (op - cpy); - op = cpy; - - /* get offset */ - LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); - ip += 2; - if (ref < (BYTE * const) dest) - /* - * Error: offset creates reference outside of - * destination buffer - */ - goto _output_error; - - /* get matchlength */ - if ((length = (token & ML_MASK)) == ML_MASK) { - while (ip < iend) { - int s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - if (s == 255) - continue; - break; - } - } - /* copy repeated sequence */ - if (unlikely(op - ref < STEPSIZE)) { -#if LZ4_ARCH64 - int dec64 = dec64table[op - ref]; -#else - const int dec64 = 0; -#endif - op[0] = ref[0]; - op[1] = ref[1]; - op[2] = ref[2]; - op[3] = ref[3]; - op += 4; - ref += 4; - ref -= dec32table[op - ref]; - A32(op) = A32(ref); - op += STEPSIZE - 4; - ref -= dec64; - } else { - LZ4_COPYSTEP(ref, op); - } - cpy = op + length - (STEPSIZE - 4); - if (cpy > oend - COPYLENGTH) { - if (cpy > oend) - /* - * Error: request to write outside of - * destination buffer - */ - goto _output_error; -#if LZ4_ARCH64 - if ((ref + COPYLENGTH) > oend) -#else - if ((ref + COPYLENGTH) > oend || - (op + COPYLENGTH) > oend) -#endif - goto _output_error; - LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); - while (op < cpy) - *op++ = *ref++; - op = cpy; - if (op == oend) - /* - * Check EOF (should never happen, since - * last 5 bytes are supposed to be literals) - */ - goto _output_error; - continue; - } - LZ4_SECURECOPY(ref, op, cpy); - op = cpy; /* correction */ - } - - /* end of decoding */ - return (int)(((char *)op) - dest); - - /* write overflow error detected */ - _output_error: - return (-1); -} +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; -#ifdef __FreeBSD__ -/* - * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here. - * Should struct refTables get resized this may need to be revisited, hence - * compiler-time asserts. 
- */ -_Static_assert(sizeof(struct refTables) <= 16384, - "refTables too big for malloc"); -_Static_assert((sizeof(struct refTables) % 4096) == 0, - "refTables not a multiple of page size"); -#else -#define ZFS_LZ4_USE_CACHE -#endif +typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error; -#ifdef ZFS_LZ4_USE_CACHE -static kmem_cache_t *lz4_cache; - -void -lz4_init(void) +LZ4_FORCE_INLINE unsigned +read_variable_length(const BYTE**ip, const BYTE* lencheck, + int loop_check, int initial_check, + variable_length_error* error) { - lz4_cache = kmem_cache_create("lz4_cache", - sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); + U32 length = 0; + U32 s; + if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = initial_error; + return length; + } + do { + s = **ip; + (*ip)++; + length += s; + if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = loop_error; + return length; + } + } while (s==255); + + return length; } -void -lz4_fini(void) -{ - if (lz4_cache) { - kmem_cache_destroy(lz4_cache); - lz4_cache = NULL; - } -} +#define LZ4_STATIC_ASSERT(c) ASSERT(c) -static void * -lz4_alloc(int flags) -{ - ASSERT(lz4_cache != NULL); - return (kmem_cache_alloc(lz4_cache, flags)); -} -static void -lz4_free(void *ctx) -{ - kmem_cache_free(lz4_cache, ctx); -} -#else -void -lz4_init(void) +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */ + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) { -} + if ((src == NULL) || (outputSize < 0)) { return -1; } + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if ((endOnInput) && (unlikely(outputSize==0))) { + /* Empty output buffer */ + if (partialDecoding) return 0; + return ((srcSize==1) && (*ip==0)) ? 0 : -1; + } + if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 
1 : -1); } + if ((endOnInput) && unlikely(srcSize==0)) { return -1; } + + /* Currently the fast loop shows a regression on qualcomm arm chips. */ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "skip fast decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */ + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + if (endOnInput) { assert(ip < iend); } + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + + /* copy literals */ + cpy = op+length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if (endOnInput) { /* LZ4_decompress_safe() */ + if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + LZ4_wildCopy32(op, ip, cpy); + } else { /* LZ4_decompress_fast() */ + if (cpy>oend-8) { goto safe_literal_copy; } + LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and only relies on end-of-block properties */ + } + ip += length; op = cpy; + } else { + cpy = op+length; + if (endOnInput) { /* LZ4_decompress_safe() */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; } + /* Literals can only be 14, but hope compilers optimize if we copy by a register size */ + LZ4_memcpy(op, ip, 16); + } else { /* LZ4_decompress_fast() */ + /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and relies on end-of-block properties */ + LZ4_memcpy(op, ip, 8); + if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); } + } + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + assert(match <= op); + + /* get matchlength */ + length = token & ML_MASK; + + if (length == ML_MASK) { + variable_length_error error = ok; + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + + /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op+8, match+8, 8); + LZ4_memcpy(op+16, 
match+16, 2); + op += length; + continue; + } } } + + if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend-op)); + } else { + goto _output_error; /* end-of-block condition violated */ + } } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { *op++ = *copyFrom++; } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif -void -lz4_fini(void) -{ + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + while (1) { + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (endOnInput ? length != RUN_MASK : length <= 8) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) { + /* Copy the literals */ + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + } + + /* copy literals */ + cpy = op+length; +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + /* We've either hit the input parsing restriction or the output parsing restriction. + * In the normal scenario, decoding a full block, it must be the last sequence, + * otherwise it's an error (invalid input or dimensions). + * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because of the output parsing + * restriction, which is not valid since the output buffer is allowed to be undersized. + */ + assert(endOnInput); + DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip+length > iend) { + length = (size_t)(iend-ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op<=oend); + length = (size_t)(oend-op); + } + } else { + /* We must be on the last sequence because of the parsing limitations so check + * that we exactly regenerate the original size (must be exact when !endOnInput). + */ + if ((!endOnInput) && (cpy != oend)) { goto _output_error; } + /* We must be on the last sequence (or invalid) because of the parsing limitations + * so check that we exactly consume the input and don't overrun the output buffer. + */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) { + DEBUGLOG(6, "should have been last run of literals") + DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); + DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend); + goto _output_error; + } + } + memmove(op, ip, length); /* supports overlapping memory regions; only matters for in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + _copy_match: + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { break; } + continue; + } + + if (unlikely(offset<8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + if (endOnInput) { + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + } else { + return (int) (((const char*)ip)-src); /* Nb of input bytes read */ + } + + /* 
Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } } -static void * -lz4_alloc(int flags) -{ - return (kmem_alloc(sizeof (struct refTables), flags)); -} +/* + * LZ4_uncompress_unknownOutputSize() : + * isize : is the input size, therefore the compressed size + * maxOutputSize : is the size of the destination buffer (which must be + * already allocated) + * return : the number of bytes decoded in the destination buffer + * (necessarily <= maxOutputSize). If the source stream is + * malformed, the function will stop decoding and return a + * negative result, indicating the byte position of the faulty + * instruction. This function never writes beyond dest + + * maxOutputSize, and is therefore protected against malicious + * data packets. + * note : Destination buffer must be already allocated. + * This version is slightly slower than real_LZ4_uncompress() + * + */ -static void -lz4_free(void *ctx) +/* + * Note: In upstream code, LZ4_uncompress_unknownOutputSize is now a legacy + * wrapper for LZ4_decompress_safe which is a wrapper for + * LZ4_decompress_generic; this wrapper flattens that, rather than + * rewriting the callers. + */ +int LZ4_uncompress_unknownOutputSize(const char* source, char* dest, int compressedSize, int maxDecompressedSize) { - kmem_free(ctx, sizeof (struct refTables)); + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); } -#endif diff --git a/sys/contrib/openzfs/module/zfs/lz4_zfs.c b/sys/contrib/openzfs/module/zfs/lz4_zfs.c new file mode 100644 index 000000000000..820556effb8b --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/lz4_zfs.c @@ -0,0 +1,935 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2013, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +/* + * N.B. 
- This file seems to be based on LZ4 r85, dated Dec 10, 2012 + */ + +#include <sys/zfs_context.h> +#include <sys/zio_compress.h> + +static int real_LZ4_compress(const char *source, char *dest, int isize, + int osize); +static int LZ4_compressCtx(void *ctx, const char *source, char *dest, + int isize, int osize); +static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest, + int isize, int osize); + +/* See lz4.c */ +int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, + int isize, int maxOutputSize); + +static void *lz4_alloc(int flags); +static void lz4_free(void *ctx); + +size_t +lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) +{ + (void) n; + uint32_t bufsiz; + char *dest = d_start; + + ASSERT(d_len >= sizeof (bufsiz)); + + bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len, + d_len - sizeof (bufsiz)); + + /* Signal an error if the compression routine returned zero. */ + if (bufsiz == 0) + return (s_len); + + /* + * The exact compressed size is needed by the decompression routine, + * so it is stored at the start of the buffer. Note that this may be + * less than the compressed block size, which is rounded up to a + * multiple of 1<<ashift. + */ + *(uint32_t *)dest = BE_32(bufsiz); + + return (bufsiz + sizeof (bufsiz)); +} + +int +lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) +{ + (void) n; + const char *src = s_start; + uint32_t bufsiz = BE_IN32(src); + + /* invalid compressed buffer size encoded at start */ + if (bufsiz + sizeof (bufsiz) > s_len) + return (1); + + /* + * Returns 0 on success (decompression function returned non-negative) + * and non-zero on failure (decompression function returned negative). + */ + return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], + d_start, bufsiz, d_len) < 0); +} + +/* + * LZ4 API Description: + * + * Simple Functions: + * real_LZ4_compress() : + * isize : is the input size. Max supported value is ~1.9GB + * return : the number of bytes written in buffer dest + * or 0 if the compression fails (if LZ4_COMPRESSMIN is set). + * note : destination buffer must be already allocated. + * destination buffer must be sized to handle worst cases + * situations (input data not compressible) worst case size + * evaluation is provided by function LZ4_compressBound(). + * + * real_LZ4_uncompress() : + * osize : is the output size, therefore the original size + * return : the number of bytes read in the source buffer. + * If the source stream is malformed, the function will stop + * decoding and return a negative result, indicating the byte + * position of the faulty instruction. This function never + * writes beyond dest + osize, and is therefore protected + * against malicious data packets. + * note : destination buffer must be already allocated + * note : real_LZ4_uncompress() is not used in ZFS so its code + * is not present here. + * + * Advanced Functions + * + * LZ4_compressBound() : + * Provides the maximum size that LZ4 may output in a "worst case" + * scenario (input data not compressible) primarily useful for memory + * allocation of output buffer. + * + * isize : is the input size. 
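As an aside on the framing used by lz4_compress_zfs()/lz4_decompress_zfs() above: the compressed buffer starts with a 4-byte big-endian length of the LZ4 payload, and the decompressor rejects any header whose claimed size would run past the source buffer. A minimal userspace sketch of that convention follows; the helper names are invented for illustration and are not part of the ZFS code.

#include <stdint.h>
#include <stddef.h>

/* Store a 32-bit payload length big-endian, as lz4_compress_zfs() does. */
static void
sketch_put_be32(uint8_t *dst, uint32_t v)
{
	dst[0] = (uint8_t)(v >> 24);
	dst[1] = (uint8_t)(v >> 16);
	dst[2] = (uint8_t)(v >> 8);
	dst[3] = (uint8_t)(v);
}

static uint32_t
sketch_get_be32(const uint8_t *src)
{
	return ((uint32_t)src[0] << 24 | (uint32_t)src[1] << 16 |
	    (uint32_t)src[2] << 8 | (uint32_t)src[3]);
}

/* Mirrors the sanity check at the top of lz4_decompress_zfs(). */
static int
sketch_frame_payload_len(const uint8_t *src, size_t s_len, uint32_t *payload)
{
	if (s_len < sizeof (uint32_t))
		return (-1);
	*payload = sketch_get_be32(src);
	if (*payload + sizeof (uint32_t) > s_len)
		return (-1);	/* header claims more data than the buffer holds */
	return (0);
}

The payload that follows the header is then handed to LZ4_uncompress_unknownOutputSize() with the stored length as the compressed size.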
Max supported value is ~1.9GB + * return : maximum output size in a "worst case" scenario + * note : this function is limited by "int" range (2^31-1) + * + * LZ4_uncompress_unknownOutputSize() : + * isize : is the input size, therefore the compressed size + * maxOutputSize : is the size of the destination buffer (which must be + * already allocated) + * return : the number of bytes decoded in the destination buffer + * (necessarily <= maxOutputSize). If the source stream is + * malformed, the function will stop decoding and return a + * negative result, indicating the byte position of the faulty + * instruction. This function never writes beyond dest + + * maxOutputSize, and is therefore protected against malicious + * data packets. + * note : Destination buffer must be already allocated. + * This version is slightly slower than real_LZ4_uncompress() + * + * LZ4_compressCtx() : + * This function explicitly handles the CTX memory structure. + * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_cache_alloc). Passing + * NULL isn't valid. + * + * LZ4_compress64kCtx() : + * Same as LZ4_compressCtx(), but specific to small inputs (<64KB). + * isize *Must* be <64KB, otherwise the output will be corrupted. + * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_cache_alloc). Passing + * NULL isn't valid. + */ + +/* + * Tuning parameters + */ + +/* + * COMPRESSIONLEVEL: Increasing this value improves compression ratio + * Lowering this value reduces memory usage. Reduced memory usage + * typically improves speed, due to cache effect (ex: L1 32KB for Intel, + * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes + * (examples : 12 -> 16KB ; 17 -> 512KB) + */ +#define COMPRESSIONLEVEL 12 + +/* + * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the + * algorithm skip faster data segments considered "incompressible". + * This may decrease compression ratio dramatically, but will be + * faster on incompressible data. Increasing this value will make + * the algorithm search more before declaring a segment "incompressible". + * This could improve compression a bit, but will be slower on + * incompressible data. The default value (6) is recommended. + */ +#define NOTCOMPRESSIBLE_CONFIRMATION 6 + +/* + * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to + * performance for big endian cpu, but the resulting compressed stream + * will be incompatible with little-endian CPU. You can set this option + * to 1 in situations where data will stay within closed environment. + * This option is useless on Little_Endian CPU (such as x86). + */ +/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ + +/* + * CPU Feature Detection + */ + +/* 32 or 64 bits ? */ +#if defined(_LP64) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Little Endian or Big Endian? + * Note: overwrite the below #define if you know your architecture endianness. + */ +#if defined(_ZFS_BIG_ENDIAN) +#define LZ4_BIG_ENDIAN 1 +#else +/* + * Little Endian assumed. PDP Endian and other very rare endian format + * are unsupported. + */ +#undef LZ4_BIG_ENDIAN +#endif + +/* + * Unaligned memory access is automatically enabled for "common" CPU, + * such as x86. For others CPU, the compiler will be more cautious, and + * insert extra code to ensure aligned access is respected. 
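A quick check of the memory-usage formula quoted with COMPRESSIONLEVEL above (N -> 2^(N+2) bytes): the compressor's hash table has 1 << HASH_LOG entries (HASH_LOG is defined as COMPRESSIONLEVEL a little further down), so with 4-byte entries the defaults work out as in this compile-time sketch. The 4-byte entry size is an assumption matching the LP64 HTYPE; the 64 KB variant uses 2-byte entries instead.

#include <stdint.h>

#define SKETCH_HASH_LOG		12	/* the COMPRESSIONLEVEL default */

/* 2^12 entries * 4 bytes = 2^(12+2) bytes = 16 KB */
_Static_assert((1u << SKETCH_HASH_LOG) * sizeof (uint32_t) == 16 * 1024,
	"HASH_LOG 12 -> 16KB table");
/* 2^17 entries * 4 bytes = 512 KB */
_Static_assert((1u << 17) * sizeof (uint32_t) == 512 * 1024,
	"HASH_LOG 17 -> 512KB table");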
If you know + * your target CPU supports unaligned memory access, you may want to + * force this option manually to improve performance + */ +#if defined(__ARM_FEATURE_UNALIGNED) +#define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +/* + * Illumos : we can't use GCC's __builtin_ctz family of builtins in the + * kernel + * Linux : we can use GCC's __builtin_ctz family of builtins in the + * kernel + */ +#undef LZ4_FORCE_SW_BITCOUNT +#if defined(__sparc) +#define LZ4_FORCE_SW_BITCOUNT +#endif + +/* + * Compiler Options + */ +/* Disable restrict */ +#define restrict + +/* + * Linux : GCC_VERSION is defined as of 3.9-rc1, so undefine it. + * torvalds/linux@3f3f8d2f48acfd8ed3b8e6b7377935da57b27b16 + */ +#ifdef GCC_VERSION +#undef GCC_VERSION +#endif + +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +#define expect(expr, value) (__builtin_expect((expr), (value))) +#else +#define expect(expr, value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif + +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ + (((x) & 0xffu) << 8))) + +/* Basic types */ +#define BYTE uint8_t +#define U16 uint16_t +#define U32 uint32_t +#define S32 int32_t +#define U64 uint64_t + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack(1) +#endif + +typedef struct _U16_S { + U16 v; +} U16_S; +typedef struct _U32_S { + U32 v; +} U32_S; +typedef struct _U64_S { + U64 v; +} U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack() +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + +/* + * Constants + */ +#define MINMATCH 4 + +#define HASH_LOG COMPRESSIONLEVEL +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + +#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? 
\ + NOTCOMPRESSIBLE_CONFIRMATION : 2) + +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<<ML_BITS)-1) +#define RUN_BITS (8-ML_BITS) +#define RUN_MASK ((1U<<RUN_BITS)-1) + + +/* + * Architecture-specific macros + */ +#if LZ4_ARCH64 +#define STEPSIZE 8 +#define UARCH U64 +#define AARCH A64 +#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8; +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) +#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e) +#define HTYPE U32 +#define INITBASE(base) const BYTE* const base = ip +#else /* !LZ4_ARCH64 */ +#define STEPSIZE 4 +#define UARCH U32 +#define AARCH A32 +#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4; +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d); +#define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const BYTE * +#define INITBASE(base) const int base = 0 +#endif /* !LZ4_ARCH64 */ + +#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE)) +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; } +#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \ + { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; } +#else +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); } +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; } +#endif + + +/* Local structures */ +struct refTables { + HTYPE hashTable[HASHTABLESIZE]; +}; + + +/* Macros */ +#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \ + HASH_LOG)) +#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) +#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e); +#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \ + d = e; } + + +/* Private functions */ +#if LZ4_ARCH64 + +static inline int +LZ4_NbCommonBytes(register U64 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +#else + int r; + if (!(val >> 32)) { + r = 4; + } else { + r = 0; + val >>= 32; + } + if (!(val >> 16)) { + r += 2; + val >>= 8; + } else { + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +#else + static const int DeBruijnBytePos[64] = + { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, + 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, + 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, + 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 + }; + return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >> + 58]; +#endif +#endif +} + +#else + +static inline int +LZ4_NbCommonBytes(register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +#else + int r; + if (!(val >> 16)) { + r = 2; + val >>= 8; + } else { + r = 0; + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if defined(__GNUC__) && (GCC_VERSION >= 304) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +#else + static const int DeBruijnBytePos[32] = { + 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 
1, 1 + }; + return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> + 27]; +#endif +#endif +} + +#endif + +/* Compression functions */ + +static int +LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ + struct refTables *srt = (struct refTables *)ctx; + HTYPE *HashTable = (HTYPE *) (srt->hashTable); + + const BYTE *ip = (BYTE *) source; + INITBASE(base); + const BYTE *anchor = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardH = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (unlikely(forwardIp > mflimit)) { + goto _last_literals; + } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (likely(ip < matchlimit - (STEPSIZE - 1))) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + + _last_literals: + /* Encode Last 
Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) { + *op++ = 255; + } + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + + + +/* Note : this function is valid only if isize < LZ4_64KLIMIT */ +#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1)) +#define HASHLOG64K (HASH_LOG + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \ + HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) + +static int +LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ + struct refTables *srt = (struct refTables *)ctx; + U16 *HashTable = (U16 *) (srt->hashTable); + + const BYTE *ip = (BYTE *) source; + const BYTE *anchor = ip; + const BYTE *const base = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (ip < matchlimit - (STEPSIZE - 1)) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { 
+ len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) + *op++ = 255; + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + +static int +real_LZ4_compress(const char *source, char *dest, int isize, int osize) +{ + void *ctx; + int result; + + ctx = lz4_alloc(KM_SLEEP); + + /* + * out of kernel memory, gently fall through - this will disable + * compression in zio_compress_data + */ + if (ctx == NULL) + return (0); + + memset(ctx, 0, sizeof (struct refTables)); + + if (isize < LZ4_64KLIMIT) + result = LZ4_compress64kCtx(ctx, source, dest, isize, osize); + else + result = LZ4_compressCtx(ctx, source, dest, isize, osize); + + lz4_free(ctx); + return (result); +} + +#ifdef __FreeBSD__ +/* + * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here. + * Should struct refTables get resized this may need to be revisited, hence + * compiler-time asserts. + */ +_Static_assert(sizeof(struct refTables) <= 16384, + "refTables too big for malloc"); +_Static_assert((sizeof(struct refTables) % 4096) == 0, + "refTables not a multiple of page size"); +#else +#define ZFS_LZ4_USE_CACHE +#endif + +#ifdef ZFS_LZ4_USE_CACHE +static kmem_cache_t *lz4_cache; +#endif + +#ifdef ZFS_LZ4_USE_CACHE +void +lz4_init(void) +{ + lz4_cache = kmem_cache_create("lz4_cache", + sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +lz4_fini(void) +{ + if (lz4_cache) { + kmem_cache_destroy(lz4_cache); + lz4_cache = NULL; + } +} + +static void * +lz4_alloc(int flags) +{ + ASSERT(lz4_cache != NULL); + return (kmem_cache_alloc(lz4_cache, flags)); +} + +static void +lz4_free(void *ctx) +{ + kmem_cache_free(lz4_cache, ctx); +} +#else +void +lz4_init(void) +{ +} + +void +lz4_fini(void) +{ +} + +static void * +lz4_alloc(int flags) +{ + return (kmem_alloc(sizeof (struct refTables), flags)); +} + +static void +lz4_free(void *ctx) +{ + kmem_free(ctx, sizeof (struct refTables)); +} +#endif diff --git a/sys/contrib/openzfs/module/zfs/lzjb.c b/sys/contrib/openzfs/module/zfs/lzjb.c index a478e64c5141..a24f17e0fe74 100644 --- a/sys/contrib/openzfs/module/zfs/lzjb.c +++ b/sys/contrib/openzfs/module/zfs/lzjb.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -45,10 +45,10 @@ #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) #define LEMPEL_SIZE 1024 -/*ARGSUSED*/ size_t lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { + (void) n; uchar_t *src = s_start; uchar_t *dst = d_start; uchar_t *cpy; @@ -100,10 +100,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) return (dst - (uchar_t *)d_start); } -/*ARGSUSED*/ int lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { + (void) s_len, (void) n; uchar_t *src = s_start; uchar_t *dst = d_start; uchar_t *d_end = (uchar_t *)d_start + d_len; diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index d1fee70f004b..7170b5eefcea 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -40,23 +40,26 @@ #include <sys/zap.h> #include <sys/btree.h> -#define WITH_DF_BLOCK_ALLOCATOR - #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal - * operation, we will try to write this amount of data to a top-level vdev - * before moving on to the next one. + * operation, we will try to write this amount of data to each disk before + * moving on to the next top-level vdev. */ -unsigned long metaslab_aliquot = 512 << 10; +static uint64_t metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ -unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; +uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; + +/* + * Of blocks of size >= metaslab_force_ganging, actually gang them this often. + */ +uint_t metaslab_force_ganging_pct = 3; /* * In pools where the log space map feature is not enabled we touch @@ -81,7 +84,7 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17); * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ -int zfs_condense_pct = 200; +uint_t zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of @@ -96,7 +99,7 @@ int zfs_condense_pct = 200; * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ -int zfs_metaslab_condense_block_threshold = 4; +static const int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should @@ -111,7 +114,7 @@ int zfs_metaslab_condense_block_threshold = 4; * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ -int zfs_mg_noalloc_threshold = 0; +static uint_t zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their @@ -135,7 +138,7 @@ int zfs_mg_noalloc_threshold = 0; * enough to avoid hitting the speed bump on pools that are being pushed * to the edge. 
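The metaslab_force_ganging_pct tunable added earlier in this hunk turns forced ganging into a probabilistic test: blocks at or above metaslab_force_ganging are ganged only about that percentage of the time, as the later change to metaslab_alloc_dva() shows. Below is a self-contained sketch of the decision; sketch_random_in_range() is a userspace stand-in for the kernel's random_in_range(), and the threshold is passed in rather than hard-coded.

#include <stdint.h>
#include <stdlib.h>

#define SKETCH_FORCE_GANGING_PCT	3u	/* matches the new default */

/* Stand-in for random_in_range(): returns a value in [0, range). */
static uint32_t
sketch_random_in_range(uint32_t range)
{
	return ((uint32_t)rand() % range);
}

/* Gang roughly SKETCH_FORCE_GANGING_PCT percent of qualifying allocations. */
static int
sketch_should_force_gang(uint64_t psize, uint64_t force_ganging_threshold)
{
	if (psize < force_ganging_threshold || SKETCH_FORCE_GANGING_PCT == 0)
		return (0);
	return (sketch_random_in_range(100) <
	    (SKETCH_FORCE_GANGING_PCT < 100 ? SKETCH_FORCE_GANGING_PCT : 100));
}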
*/ -int zfs_mg_fragmentation_threshold = 95; +static uint_t zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation @@ -143,17 +146,17 @@ int zfs_mg_fragmentation_threshold = 95; * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ -int zfs_metaslab_fragmentation_threshold = 70; +static uint_t zfs_metaslab_fragmentation_threshold = 70; /* * When set will load all metaslabs when pool is first opened. */ -int metaslab_debug_load = 0; +int metaslab_debug_load = B_FALSE; /* * When set will prevent metaslabs from being unloaded. */ -int metaslab_debug_unload = 0; +static int metaslab_debug_unload = B_FALSE; /* * Minimum size which forces the dynamic allocator to change @@ -169,7 +172,7 @@ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ -int metaslab_df_free_pct = 4; +uint_t metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this @@ -184,14 +187,14 @@ int metaslab_df_free_pct = 4; * With the default setting of 16MB this is 16*1024 (with ashift=9) or * 2048 (with ashift=12). */ -int metaslab_df_max_search = 16 * 1024 * 1024; +static uint_t metaslab_df_max_search = 16 * 1024 * 1024; /* * Forces the metaslab_block_picker function to search for at least this many * segments forwards until giving up on finding a segment that the allocation * will fit into. */ -uint32_t metaslab_min_search_count = 100; +static const uint32_t metaslab_min_search_count = 100; /* * If we are not searching forward (due to metaslab_df_max_search, @@ -200,12 +203,7 @@ uint32_t metaslab_min_search_count = 100; * segment. If it is not set, we will use a segment of exactly the requested * size (or larger). */ -int metaslab_df_use_largest_segment = B_FALSE; - -/* - * Percentage of all cpus that can be used by the metaslab taskq. - */ -int metaslab_load_pct = 50; +static int metaslab_df_use_largest_segment = B_FALSE; /* * These tunables control how long a metaslab will remain loaded after the @@ -215,56 +213,56 @@ int metaslab_load_pct = 50; * unloaded sooner. These settings are intended to be generous -- to keep * metaslabs loaded for a long time, reducing the rate of metaslab loading. */ -int metaslab_unload_delay = 32; -int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ +static uint_t metaslab_unload_delay = 32; +static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */ /* * Max number of metaslabs per group to preload. */ -int metaslab_preload_limit = 10; +uint_t metaslab_preload_limit = 10; /* * Enable/disable preloading of metaslab. */ -int metaslab_preload_enabled = B_TRUE; +static int metaslab_preload_enabled = B_TRUE; /* * Enable/disable fragmentation weighting on metaslabs. */ -int metaslab_fragmentation_factor_enabled = B_TRUE; +static int metaslab_fragmentation_factor_enabled = B_TRUE; /* * Enable/disable lba weighting (i.e. outer tracks are given preference). */ -int metaslab_lba_weighting_enabled = B_TRUE; +static int metaslab_lba_weighting_enabled = B_TRUE; /* * Enable/disable metaslab group biasing. */ -int metaslab_bias_enabled = B_TRUE; +static int metaslab_bias_enabled = B_TRUE; /* * Enable/disable remapping of indirect DVAs to their concrete vdevs. 
*/ -boolean_t zfs_remap_blkptr_enable = B_TRUE; +static const boolean_t zfs_remap_blkptr_enable = B_TRUE; /* * Enable/disable segment-based metaslab selection. */ -int zfs_metaslab_segment_weight_enabled = B_TRUE; +static int zfs_metaslab_segment_weight_enabled = B_TRUE; /* * When using segment-based metaslab selection, we will continue * allocating from the active metaslab until we have exhausted * zfs_metaslab_switch_threshold of its buckets. */ -int zfs_metaslab_switch_threshold = 2; +static int zfs_metaslab_switch_threshold = 2; /* * Internal switch to enable/disable the metaslab allocation tracing * facility. */ -boolean_t metaslab_trace_enabled = B_FALSE; +static const boolean_t metaslab_trace_enabled = B_FALSE; /* * Maximum entries that the metaslab allocation tracing facility will keep @@ -274,32 +272,32 @@ boolean_t metaslab_trace_enabled = B_FALSE; * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ -uint64_t metaslab_trace_max_entries = 5000; +static const uint64_t metaslab_trace_max_entries = 5000; /* * Maximum number of metaslabs per group that can be disabled * simultaneously. */ -int max_disabled_ms = 3; +static const int max_disabled_ms = 3; /* * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. * To avoid 64-bit overflow, don't set above UINT32_MAX. */ -unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ +static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */ /* * Maximum percentage of memory to use on storing loaded metaslabs. If loading * a metaslab would take it over this percentage, the oldest selected metaslab * is automatically unloaded. */ -int zfs_metaslab_mem_limit = 25; +static uint_t zfs_metaslab_mem_limit = 25; /* * Force the per-metaslab range trees to use 64-bit integers to store * segments. Used for debugging purposes. */ -boolean_t zfs_metaslab_force_large_segs = B_FALSE; +static const boolean_t zfs_metaslab_force_large_segs = B_FALSE; /* * By default we only store segments over a certain size in the size-sorted @@ -308,7 +306,7 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE; * improves load and unload times at the cost of causing us to use slightly * larger segments than we would otherwise in some cases. */ -uint32_t metaslab_by_size_min_shift = 14; +static const uint32_t metaslab_by_size_min_shift = 14; /* * If not set, we will first try normal allocation. If that fails then @@ -321,7 +319,7 @@ uint32_t metaslab_by_size_min_shift = 14; * allocation. If that fails we will do a "try hard" gang allocation. If * that fails then we will have a multi-layer gang block. */ -int zfs_metaslab_try_hard_before_gang = B_FALSE; +static int zfs_metaslab_try_hard_before_gang = B_FALSE; /* * When not trying hard, we only consider the best zfs_metaslab_find_max_tries @@ -337,7 +335,7 @@ int zfs_metaslab_try_hard_before_gang = B_FALSE; * subsequent metaslab has ms_max_size >60KB (but fewer segments in this * bucket, and therefore a lower weight). 
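To make the zfs_metaslab_find_max_tries cutoff just described concrete, here is a deliberately simplified model: walk a weight-sorted candidate list, stop after max_tries candidates unless try_hard is set, and take the first entry whose cached maximum free-segment size fits the request. The struct and function names are invented for the sketch; the real selection is done around find_valid_metaslab() further down in this file.

#include <stdint.h>

typedef struct sketch_ms {
	uint64_t	ms_max_size;	/* largest known free segment */
} sketch_ms_t;

/* Returns an index into the candidate array, or -1 if nothing fit. */
static int
sketch_pick_metaslab(const sketch_ms_t *cand, int ncand, uint64_t asize,
    int try_hard, unsigned int max_tries)
{
	unsigned int tries = 0;

	for (int i = 0; i < ncand; i++, tries++) {
		if (!try_hard && tries > max_tries)
			break;		/* give up; caller may retry harder */
		if (cand[i].ms_max_size >= asize)
			return (i);
	}
	return (-1);
}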
*/ -int zfs_metaslab_find_max_tries = 100; +static uint_t zfs_metaslab_find_max_tries = 100; static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); @@ -370,7 +368,7 @@ static metaslab_stats_t metaslab_stats = { atomic_inc_64(&metaslab_stats.stat.value.ui64); -kstat_t *metaslab_ksp; +static kstat_t *metaslab_ksp; void metaslab_stat_init(void) @@ -406,7 +404,7 @@ metaslab_stat_fini(void) * ========================================================================== */ metaslab_class_t * -metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) +metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) { metaslab_class_t *mc; @@ -629,8 +627,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc) * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ - space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, + 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); @@ -640,8 +638,9 @@ void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; + hrtime_t now = gethrtime(); for (int i = 0; i < multilist_get_num_sublists(ml); i++) { - multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { @@ -658,13 +657,15 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) i--; break; } - mls = multilist_sublist_lock(ml, i); + mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && - gethrtime() > msp->ms_selected_time + - (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) { + now > msp->ms_selected_time + + MSEC2NSEC(metaslab_unload_delay_ms) && + (msp->ms_allocator == -1 || + !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* @@ -851,9 +852,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); } - mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); - return (mg); } @@ -869,7 +867,6 @@ metaslab_group_destroy(metaslab_group_t *mg) */ ASSERT(mg->mg_activation_count <= 0); - taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); @@ -899,7 +896,8 @@ metaslab_group_activate(metaslab_group_t *mg) if (++mg->mg_activation_count <= 0) return; - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + mg->mg_aliquot = metaslab_aliquot * MAX(1, + vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { @@ -959,7 +957,7 @@ metaslab_group_passivate(metaslab_group_t *mg) * allocations from taking place and any changes to the vdev tree. 
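One consequence of the aliquot changes in this hunk: metaslab_aliquot is now expressed per data disk, so metaslab_group_activate() scales it by the number of non-parity disks in the top-level vdev rather than by the child count. A small worked sketch, with SKETCH_* names standing in for the real tunables:

#include <stdint.h>

#define SKETCH_METASLAB_ALIQUOT	(1024u * 1024u)		/* new 1 MiB default */
#define SKETCH_MAX(a, b)	((a) > (b) ? (a) : (b))

/* Bytes aimed at one top-level vdev before rotoring to the next. */
static uint64_t
sketch_group_aliquot(uint64_t ndisks, uint64_t nparity)
{
	return (SKETCH_METASLAB_ALIQUOT *
	    SKETCH_MAX((uint64_t)1, ndisks - nparity));
}

For example, a 10-disk raidz2 top-level vdev has 8 data disks, so the group aliquot is 8 MiB, roughly 1 MiB of data per disk per pass.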
*/ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); - taskq_wait_outstanding(mg->mg_taskq, 0); + taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { @@ -1222,7 +1220,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg) */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize, int allocator, int d) + int flags, uint64_t psize, int allocator, int d) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; @@ -1267,6 +1265,15 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, return (B_FALSE); /* + * Some allocations (e.g., those coming from device removal + * where the * allocations are not even counted in the + * metaslab * allocation queues) are allowed to bypass + * the throttle. + */ + if (flags & METASLAB_DONT_THROTTLE) + return (B_TRUE); + + /* * Relax allocation throttling for ditto blocks. Due to * random imbalances in allocation it tends to push copies * to one vdev, that looks a bit better at the moment. @@ -1277,7 +1284,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, /* * If this metaslab group is below its qmax or it's - * the only allocatable metasable group, then attempt + * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) @@ -1332,6 +1339,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { @@ -1342,16 +1350,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { @@ -1362,11 +1369,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } + typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; @@ -1402,11 +1408,17 @@ metaslab_size_tree_full_load(range_tree_t *rt) range_tree_walk(rt, metaslab_size_sorted_add, &arg); } + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, + range_seg32_t, metaslab_rangesize32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, + range_seg64_t, metaslab_rangesize64_compare) + /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
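The comparator hunks above replace the early-return tie-breaker with the branchless form cmp + !cmp * TREE_CMP(start1, start2), which works because the three-way compare yields exactly -1, 0 or 1. A standalone illustration of the equivalence; sketch_cmp3() plays the role of TREE_CMP() and the types are simplified.

#include <stdint.h>

/* Three-way compare returning -1, 0 or 1. */
static int
sketch_cmp3(uint64_t a, uint64_t b)
{
	return ((a > b) - (a < b));
}

typedef struct sketch_seg {
	uint64_t start;
	uint64_t end;
} sketch_seg_t;

/* Sort by size first, then by start offset, without a conditional branch. */
static int
sketch_rangesize_compare(const sketch_seg_t *s1, const sketch_seg_t *s2)
{
	int cmp = sketch_cmp3(s1->end - s1->start, s2->end - s2->start);

	/* !cmp is 1 only on a size tie, so exactly one term contributes. */
	return (cmp + !cmp * sketch_cmp3(s1->start, s2->start));
}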
*/ -/* ARGSUSED */ static void metaslab_rt_create(range_tree_t *rt, void *arg) { @@ -1415,26 +1427,29 @@ metaslab_rt_create(range_tree_t *rt, void *arg) size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; + bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; + bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } - zfs_btree_create(size_tree, compare, size); + zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } -/* ARGSUSED */ static void metaslab_rt_destroy(range_tree_t *rt, void *arg) { + (void) rt; metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; @@ -1442,7 +1457,6 @@ metaslab_rt_destroy(range_tree_t *rt, void *arg) kmem_free(mrap, sizeof (*mrap)); } -/* ARGSUSED */ static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) { @@ -1450,27 +1464,25 @@ metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < - (1 << mrap->mra_floor_shift)) + (1ULL << mrap->mra_floor_shift)) return; zfs_btree_add(size_tree, rs); } -/* ARGSUSED */ static void metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; - if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_remove(size_tree, rs); } -/* ARGSUSED */ static void metaslab_rt_vacate(range_tree_t *rt, void *arg) { @@ -1482,7 +1494,7 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg) metaslab_rt_create(rt, arg); } -static range_tree_ops_t metaslab_rt_ops = { +static const range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, @@ -1602,9 +1614,6 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, return (rs); } -#if defined(WITH_DF_BLOCK_ALLOCATOR) || \ - defined(WITH_CF_BLOCK_ALLOCATOR) - /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. 
This will search the specified B-tree looking @@ -1639,9 +1648,74 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, *cursor = 0; return (-1ULL); } -#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */ -#if defined(WITH_DF_BLOCK_ALLOCATOR) +static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); +static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); +static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); +metaslab_ops_t *metaslab_allocator(spa_t *spa); + +static metaslab_ops_t metaslab_allocators[] = { + { "dynamic", metaslab_df_alloc }, + { "cursor", metaslab_cf_alloc }, + { "new-dynamic", metaslab_ndf_alloc }, +}; + +static int +spa_find_allocator_byname(const char *val) +{ + int a = ARRAY_SIZE(metaslab_allocators) - 1; + if (strcmp("new-dynamic", val) == 0) + return (-1); /* remove when ndf is working */ + for (; a >= 0; a--) { + if (strcmp(val, metaslab_allocators[a].msop_name) == 0) + return (a); + } + return (-1); +} + +void +spa_set_allocator(spa_t *spa, const char *allocator) +{ + int a = spa_find_allocator_byname(allocator); + if (a < 0) a = 0; + spa->spa_active_allocator = a; + zfs_dbgmsg("spa allocator: %s\n", metaslab_allocators[a].msop_name); +} + +int +spa_get_allocator(spa_t *spa) +{ + return (spa->spa_active_allocator); +} + +#if defined(_KERNEL) +int +param_set_active_allocator_common(const char *val) +{ + char *p; + + if (val == NULL) + return (SET_ERROR(EINVAL)); + + if ((p = strchr(val, '\n')) != NULL) + *p = '\0'; + + int a = spa_find_allocator_byname(val); + if (a < 0) + return (SET_ERROR(EINVAL)); + + zfs_active_allocator = metaslab_allocators[a].msop_name; + return (0); +} +#endif + +metaslab_ops_t * +metaslab_allocator(spa_t *spa) +{ + int allocator = spa_get_allocator(spa); + return (&metaslab_allocators[allocator]); +} + /* * ========================================================================== * Dynamic Fit (df) block allocator @@ -1675,7 +1749,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -1716,14 +1790,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) return (offset); } -static metaslab_ops_t metaslab_df_ops = { - metaslab_df_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; -#endif /* WITH_DF_BLOCK_ALLOCATOR */ - -#if defined(WITH_CF_BLOCK_ALLOCATOR) /* * ========================================================================== * Cursor fit block allocator - @@ -1766,14 +1832,6 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) return (offset); } -static metaslab_ops_t metaslab_cf_ops = { - metaslab_cf_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops; -#endif /* WITH_CF_BLOCK_ALLOCATOR */ - -#if defined(WITH_NDF_BLOCK_ALLOCATOR) /* * ========================================================================== * New dynamic fit allocator - @@ -1830,14 +1888,6 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) return (-1ULL); } -static metaslab_ops_t metaslab_ndf_ops = { - metaslab_ndf_alloc -}; - -metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; -#endif /* WITH_NDF_BLOCK_ALLOCATOR */ - - /* * ========================================================================== * Metaslabs @@ -1962,9 +2012,9 @@ metaslab_aux_histograms_clear(metaslab_t *msp) */ 
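The allocator table introduced above replaces the old WITH_*_BLOCK_ALLOCATOR compile-time selection with a runtime lookup by name. A minimal standalone version of the pattern follows; the entries mirror the names in metaslab_allocators[] and the fall-back-to-first-entry policy of spa_set_allocator(), but everything here is illustrative rather than the actual kernel interface.

#include <string.h>

typedef struct sketch_alloc_ops {
	const char	*name;	/* the allocation callback would live here too */
} sketch_alloc_ops_t;

static const sketch_alloc_ops_t sketch_allocators[] = {
	{ "dynamic" },
	{ "cursor" },
	{ "new-dynamic" },
};

#define SKETCH_NALLOC \
	(sizeof (sketch_allocators) / sizeof (sketch_allocators[0]))

/* Return the table index for a name, or -1 if it is unknown. */
static int
sketch_find_allocator_byname(const char *val)
{
	for (int a = (int)SKETCH_NALLOC - 1; a >= 0; a--) {
		if (strcmp(val, sketch_allocators[a].name) == 0)
			return (a);
	}
	return (-1);
}

/* Unknown names fall back to index 0 ("dynamic"). */
static int
sketch_set_allocator(const char *name)
{
	int a = sketch_find_allocator_byname(name);
	return (a < 0 ? 0 : a);
}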
ASSERT(msp->ms_loaded); - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); for (int t = 0; t < TXG_DEFER_SIZE; t++) - bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); + memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); } static void @@ -2054,13 +2104,13 @@ metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) */ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; if (defer_allowed) { - bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], + memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, sizeof (msp->ms_synchist)); } else { - bzero(msp->ms_deferhist[hist_index], + memset(msp->ms_deferhist[hist_index], 0, sizeof (msp->ms_deferhist[hist_index])); } - bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); } /* @@ -2178,19 +2228,19 @@ metaslab_potentially_evict(metaslab_class_t *mc) uint64_t allmem = arc_all_memory(); uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); - int tries = 0; + uint_t tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; tries++) { unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = - multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx); + multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { - VERIFY3P(mls, ==, multilist_sublist_lock( + VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); @@ -2240,6 +2290,8 @@ metaslab_potentially_evict(metaslab_class_t *mc) inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); } } +#else + (void) mc, (void) zfs_metaslab_mem_limit; #endif } @@ -2758,7 +2810,8 @@ metaslab_fini_flush_data(metaslab_t *msp) mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); - spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), + metaslab_unflushed_dirty(msp)); } uint64_t @@ -2857,7 +2910,7 @@ metaslab_fini(metaslab_t *msp) * of the table. Since the fragmentation value is never stored on disk, it * is possible to change these calculations in the future. */ -int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { +static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 100, /* 512B */ 100, /* 1K */ 98, /* 2K */ @@ -3192,6 +3245,15 @@ static boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { /* + * This case will usually but not always get caught by the checks below; + * metaslabs can be loaded by various means, including the trim and + * initialize code. Once that happens, without this check they are + * allocatable even before they finish their first txg sync. + */ + if (unlikely(msp->ms_new)) + return (B_FALSE); + + /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. 
If it's not, the ms_max_size is a lower bound (once * set), and we should use the fast check as long as we're not in @@ -3503,10 +3565,8 @@ metaslab_group_preload(metaslab_group_t *mg) avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; - if (spa_shutting_down(spa) || !metaslab_preload_enabled) { - taskq_wait_outstanding(mg->mg_taskq, 0); + if (spa_shutting_down(spa) || !metaslab_preload_enabled) return; - } mutex_enter(&mg->mg_lock); @@ -3526,8 +3586,9 @@ metaslab_group_preload(metaslab_group_t *mg) continue; } - VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, - msp, TQ_SLEEP) != TASKQID_INVALID); + VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, + msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) + != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } @@ -3558,7 +3619,7 @@ metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; - uint64_t vdev_blocksize = 1 << vd->vdev_ashift; + uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); @@ -3736,50 +3797,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) metaslab_flush_update(msp, tx); } -/* - * Called when the metaslab has been flushed (its own spacemap now reflects - * all the contents of the pool-wide spacemap log). Updates the metaslab's - * metadata and any pool-wide related log space map data (e.g. summary, - * obsolete logs, etc..) to reflect that. - */ static void -metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { - metaslab_group_t *mg = msp->ms_group; - spa_t *spa = mg->mg_vd->vdev_spa; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - ASSERT3U(spa_sync_pass(spa), ==, 1); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - /* - * Just because a metaslab got flushed, that doesn't mean that - * it will pass through metaslab_sync_done(). Thus, make sure to - * update ms_synced_length here in case it doesn't. - */ - msp->ms_synced_length = space_map_length(msp->ms_sm); + mutex_enter(&spa->spa_flushed_ms_lock); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, B_TRUE); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); - /* - * We may end up here from metaslab_condense() without the - * feature being active. In that case this is a no-op. 
- */ - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa, B_TRUE); +} +void +metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); @@ -3787,17 +3843,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); + /* update log space map summary */ + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, + ms_prev_flushed_dirty); + spa_log_summary_add_flushed_metaslab(spa, dirty); + /* cleanup obsolete logs if any */ - uint64_t log_blocks_before = spa_log_sm_nblocks(spa); spa_cleanup_old_sm_logs(spa, tx); - uint64_t log_blocks_after = spa_log_sm_nblocks(spa); - VERIFY3U(log_blocks_after, <=, log_blocks_before); +} - /* update log space map summary */ - uint64_t blocks_gone = log_blocks_before - log_blocks_after; - spa_log_summary_add_flushed_metaslab(spa); - spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); - spa_log_summary_decrement_blkcount(spa, blocks_gone); +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc..) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. 
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || + metaslab_unflushed_txg(msp) == 0) + return; + + metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t @@ -4013,23 +4099,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT0(metaslab_allocated_space(msp)); } - if (metaslab_unflushed_txg(msp) == 0 && - spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { - ASSERT(spa_syncing_log_sm(spa) != NULL); - - metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); - spa_log_sm_increment_current_mscount(spa); - spa_log_summary_add_flushed_metaslab(spa); - - ASSERT(msp->ms_sm != NULL); - mutex_enter(&spa->spa_flushed_ms_lock); - avl_add(&spa->spa_metaslabs_by_flushed, msp); - mutex_exit(&spa->spa_flushed_ms_lock); - - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - } - if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); @@ -4077,6 +4146,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + if (metaslab_unflushed_txg(msp) == 0) + metaslab_unflushed_add(msp, tx); + else if (!metaslab_unflushed_dirty(msp)) + metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); @@ -4272,7 +4345,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || + vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } @@ -4502,8 +4576,8 @@ metaslab_trace_fini(zio_alloc_list_t *zal) */ static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, - int allocator) +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, + int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) @@ -4536,8 +4610,8 @@ metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) } void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, - int allocator, boolean_t io_complete) +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, + int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) @@ -4554,7 +4628,7 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, } void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag, int allocator) { #ifdef ZFS_DEBUG @@ -4580,6 +4654,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); + VERIFY0(msp->ms_new); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -4634,7 +4709,7 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); - int tries = 0; + uint_t tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; @@ -4651,10 +4726,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } /* - * If the selected metaslab is condensing or disabled, - * skip it. 
+ * If the selected metaslab is condensing or disabled, or + * hasn't gone through a metaslab_sync_done(), then skip it. */ - if (msp->ms_condensing || msp->ms_disabled > 0) + if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; @@ -4721,7 +4796,6 @@ metaslab_active_mask_verify(metaslab_t *msp) } } -/* ARGSUSED */ static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, @@ -5034,7 +5108,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, int allocator, boolean_t try_hard) { uint64_t offset; - ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, dva, d, allocator, try_hard); @@ -5073,7 +5146,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; - metaslab_group_t *mg, *fast_mg, *rotor; + metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5086,7 +5159,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ - if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) { + if (psize >= metaslab_force_ganging && + metaslab_force_ganging_pct > 0 && + (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); @@ -5126,8 +5201,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, if (vd != NULL && vd->vdev_mg != NULL) { mg = vdev_get_mg(vd, mc); - if (flags & METASLAB_HINTBP_AVOID && - mg->mg_next != NULL) + if (flags & METASLAB_HINTBP_AVOID) mg = mg->mg_next; } else { mg = mca->mca_rotor; @@ -5135,15 +5209,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; - } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mca->mca_rotor; - - do { - if (fast_mg->mg_vd->vdev_pending_fastwrite < - mg->mg_vd->vdev_pending_fastwrite) - mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); - } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; @@ -5184,7 +5249,7 @@ top: */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, - psize, allocator, d); + flags, psize, allocator, d); } if (!allocatable) { @@ -5193,15 +5258,12 @@ top: goto next; } - ASSERT(mg->mg_initialized); - /* - * Avoid writing single-copy data to a failing, + * Avoid writing single-copy data to an unhealthy, * non-redundant vdev, unless we've already tried all * other vdevs. 
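Aside, not part of the patch: the metaslab_alloc_dva() hunk above replaces the hard-coded 3% gang-forcing roll with the new metaslab_force_ganging_pct tunable, and skips the roll entirely when the percentage is 0. A minimal user-space sketch of the gate follows; random_in_range() is stubbed with rand(), and the two tunable values are examples rather than the shipped defaults.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t metaslab_force_ganging = 16ULL * 1024 * 1024;	/* example */
static unsigned int metaslab_force_ganging_pct = 3;		/* example */

static unsigned int
random_in_range(unsigned int n)
{
	return ((unsigned int)rand() % n);	/* stand-in for the kernel RNG */
}

static int
force_gang(uint64_t psize)
{
	unsigned int pct = metaslab_force_ganging_pct;

	if (psize < metaslab_force_ganging || pct == 0)
		return (0);
	return (random_in_range(100) < (pct < 100 ? pct : 100));
}

int
main(void)
{
	int forced = 0;

	for (int i = 0; i < 100000; i++)
		forced += force_gang(32ULL * 1024 * 1024);
	/* With pct = 3, roughly 3% of qualifying allocations are forced. */
	printf("forced %d of 100000 allocations to gang\n", forced);
	return (0);
}

Setting metaslab_force_ganging_pct to 0 disables forced ganging without having to raise metaslab_force_ganging itself.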
*/ - if ((vd->vdev_stat.vs_write_errors > 0 || - vd->vdev_state < VDEV_STATE_HEALTHY) && + if (vd->vdev_state < VDEV_STATE_HEALTHY && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); @@ -5210,7 +5272,7 @@ top: ASSERT(mg->mg_class == mc); - uint64_t asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* @@ -5269,7 +5331,7 @@ top: mg->mg_bias = 0; } - if ((flags & METASLAB_FASTWRITE) || + if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; @@ -5282,11 +5344,6 @@ top: ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); - if (flags & METASLAB_FASTWRITE) { - atomic_add_64(&vd->vdev_pending_fastwrite, - psize); - } - return (0); } next: @@ -5305,7 +5362,7 @@ next: goto top; } - bzero(&dva[d], sizeof (dva_t)); + memset(&dva[d], 0, sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); @@ -5347,11 +5404,11 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, mutex_exit(&msp->ms_lock); } -/* ARGSUSED */ void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { + (void) inner_offset; boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); @@ -5441,8 +5498,9 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; - bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + BP_SET_PHYSICAL_BIRTH(bp, physical_birth); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); @@ -5629,8 +5687,7 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ - for (int d = 0; d < slots; d++) - zfs_refcount_add(&mca->mca_alloc_slots, zio); + zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } @@ -5644,8 +5701,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - for (int d = 0; d < slots; d++) - zfs_refcount_remove(&mca->mca_alloc_slots, zio); + zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); } static int @@ -5717,11 +5773,11 @@ typedef struct metaslab_claim_cb_arg_t { int mcca_error; } metaslab_claim_cb_arg_t; -/* ARGSUSED */ static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { + (void) inner_offset; metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { @@ -5793,8 +5849,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, dva_t *hintdva = (hintbp != NULL) ? 
hintbp->blk_dva : NULL; int error = 0; - ASSERT(bp->blk_birth == 0); - ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); + ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -5818,7 +5874,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); - bzero(&dva[d], sizeof (dva_t)); + memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); @@ -5848,7 +5904,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); + ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that @@ -5866,7 +5922,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; - if (bp->blk_birth <= spa->spa_checkpoint_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint @@ -5924,60 +5980,12 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) return (error); } -void -metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - atomic_add_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -void -metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) -{ - const dva_t *dva = bp->blk_dva; - int ndvas = BP_GET_NDVAS(bp); - uint64_t psize = BP_GET_PSIZE(bp); - int d; - vdev_t *vd; - - ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(psize > 0); - - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - - for (d = 0; d < ndvas; d++) { - if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) - continue; - ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); - atomic_sub_64(&vd->vdev_pending_fastwrite, psize); - } - - spa_config_exit(spa, SCL_VDEV, FTAG); -} - -/* ARGSUSED */ static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { + (void) inner, (void) arg; + if (vd->vdev_ops == &vdev_indirect_ops) return; @@ -6139,6 +6147,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) mutex_exit(&mg->mg_ms_disabled_lock); } +void +metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) +{ + ms->ms_unflushed_dirty = dirty; +} + static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { @@ -6175,22 +6189,23 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { - spa_t *spa = ms->ms_group->mg_vd->vdev_spa; - - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; - ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } +boolean_t +metaslab_unflushed_dirty(metaslab_t *ms) +{ + return (ms->ms_unflushed_dirty); +} + uint64_t 
metaslab_unflushed_txg(metaslab_t *ms) { return (ms->ms_unflushed_txg); } -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW, "Allocation granularity (a.k.a. stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, @@ -6202,29 +6217,33 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, + "Max number of metaslabs per group to preload"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); -ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be considered eligible " "for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT, - ZMOD_RW, "Fragmentation for metaslab to allow allocation"); - -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, + ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); /* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, + ZMOD_RW, "Fragmentation for metaslab to allow allocation"); + ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); @@ -6237,23 +6256,32 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW, - "Blocks larger than this size are forced to be gang blocks"); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, + "Blocks larger than this size are sometimes forced to be gang blocks"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, + "Percentage of large blocks that will be forced to be gang blocks"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to 
trust the cached max chunk size of a metaslab"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZMOD_RW, "Try hard to allocate before ganging"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, + param_set_active_allocator, param_get_charp, ZMOD_RW, + "SPA active allocator"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index f67a4eb22a2d..71122542758d 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -156,7 +156,7 @@ * vary with the I/O load and this observed value is the ub_mmp_delay which is * stored in the uberblock. The minimum allowed value is 100 ms. */ -ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; +uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; /* * Used to control the duration of the activity test on import. Smaller values @@ -186,8 +186,8 @@ uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; */ uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; -char *mmp_tag = "mmp_write_uberblock"; -static void mmp_thread(void *arg); +static const void *const mmp_tag = "mmp_write_uberblock"; +static __attribute__((noreturn)) void mmp_thread(void *arg); void mmp_init(spa_t *spa) @@ -224,7 +224,6 @@ mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) *mpp = NULL; cv_broadcast(&mmp->mmp_thread_cv); CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ - thread_exit(); } void @@ -304,8 +303,10 @@ mmp_next_leaf(spa_t *spa) do { leaf = list_next(&spa->spa_leaf_list, leaf); - if (leaf == NULL) + if (leaf == NULL) { leaf = list_head(&spa->spa_leaf_list); + ASSERT3P(leaf, !=, NULL); + } /* * We skip unwritable, offline, detached, and dRAID spare @@ -444,7 +445,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " @@ -516,8 +517,9 @@ mmp_write_uberblock(spa_t *spa) zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); mmp->mmp_seq++; mmp->mmp_kstat_id++; @@ -537,7 +539,7 @@ mmp_write_uberblock(spa_t *spa) zio_nowait(zio); } -static void +static __attribute__((noreturn)) void 
mmp_thread(void *arg) { spa_t *spa = (spa_t *)arg; @@ -549,11 +551,11 @@ mmp_thread(void *arg) uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; - boolean_t last_spa_suspended = suspended; - boolean_t last_spa_multihost = multihost; - uint64_t last_mmp_interval = mmp_interval; - uint32_t last_mmp_fail_intervals = mmp_fail_intervals; - hrtime_t last_mmp_fail_ns = mmp_fail_ns; + boolean_t last_spa_suspended; + boolean_t last_spa_multihost; + uint64_t last_mmp_interval; + uint32_t last_mmp_fail_intervals; + hrtime_t last_mmp_fail_ns; callb_cpr_t cpr; int skip_wait = 0; @@ -662,12 +664,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", @@ -698,6 +701,8 @@ mmp_thread(void *arg) mmp->mmp_zio_root = NULL; mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); + + thread_exit(); } /* @@ -733,7 +738,7 @@ mmp_signal_all_threads(void) /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, - param_set_multihost_interval, param_get_ulong, ZMOD_RW, + param_set_multihost_interval, spl_param_get_u64, ZMOD_RW, "Milliseconds between mmp writes to each leaf"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c index 8bbc9b376ae0..3d3ef86e6839 100644 --- a/sys/contrib/openzfs/module/zfs/multilist.c +++ b/sys/contrib/openzfs/module/zfs/multilist.c @@ -24,7 +24,7 @@ * This overrides the number of sublists in each multilist_t, which defaults * to the number of CPUs in the system (see multilist_create()). 
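Aside, not part of the patch: in the mmp_thread() changes above, the pool is suspended once no multihost write has succeeded for mmp_fail_ns = fail_intervals * interval nanoseconds. The arithmetic is easy to check in isolation; the sketch below uses example settings and omits the clamping macros (MMP_FAIL_INTVS_OK and friends), so the numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define	MSEC2NSEC(m)	((uint64_t)(m) * 1000000ULL)

int
main(void)
{
	uint64_t zfs_multihost_interval = 1000;		/* ms between MMP writes */
	uint32_t zfs_multihost_fail_intervals = 10;	/* example value */

	uint64_t mmp_interval = MSEC2NSEC(zfs_multihost_interval);
	uint64_t mmp_fail_ns = zfs_multihost_fail_intervals * mmp_interval;

	uint64_t since_last_write = MSEC2NSEC(12000);	/* 12 s without a write */

	if (since_last_write > mmp_fail_ns)
		printf("suspend: %llu ns since last MMP write exceeds %llu ns\n",
		    (unsigned long long)since_last_write,
		    (unsigned long long)mmp_fail_ns);
	return (0);
}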
*/ -int zfs_multilist_num_sublists = 0; +uint_t zfs_multilist_num_sublists = 0; /* * Given the object contained on the list, return a pointer to the @@ -36,6 +36,8 @@ multilist_d2l(multilist_t *ml, void *obj) { return ((multilist_node_t *)((char *)obj + ml->ml_offset)); } +#else +#define multilist_d2l(ml, obj) ((void) sizeof (ml), (void) sizeof (obj), NULL) #endif /* @@ -67,7 +69,7 @@ multilist_d2l(multilist_t *ml, void *obj) */ static void multilist_create_impl(multilist_t *ml, size_t size, size_t offset, - unsigned int num, multilist_sublist_index_func_t *index_func) + uint_t num, multilist_sublist_index_func_t *index_func) { ASSERT3U(size, >, 0); ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); @@ -102,7 +104,7 @@ void multilist_create(multilist_t *ml, size_t size, size_t offset, multilist_sublist_index_func_t *index_func) { - int num_sublists; + uint_t num_sublists; if (zfs_multilist_num_sublists > 0) { num_sublists = zfs_multilist_num_sublists; @@ -275,9 +277,15 @@ multilist_get_random_index(multilist_t *ml) return (random_in_range(ml->ml_num_sublists)); } +void +multilist_sublist_lock(multilist_sublist_t *mls) +{ + mutex_enter(&mls->mls_lock); +} + /* Lock and return the sublist specified at the given index */ multilist_sublist_t * -multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx) { multilist_sublist_t *mls; @@ -292,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) multilist_sublist_t * multilist_sublist_lock_obj(multilist_t *ml, void *obj) { - return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); + return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj))); } void @@ -325,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) list_insert_tail(&mls->mls_list, obj); } +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_after(&mls->mls_list, prev, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_before(&mls->mls_list, next, obj); +} + /* * Move the object one element forward in the list. * @@ -423,7 +447,5 @@ multilist_link_active(multilist_node_t *link) return (list_link_active(link)); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, UINT, ZMOD_RW, "Number of sublists used in each multilist"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/pathname.c b/sys/contrib/openzfs/module/zfs/pathname.c index 84ab7b7e1111..51460d119106 100644 --- a/sys/contrib/openzfs/module/zfs/pathname.c +++ b/sys/contrib/openzfs/module/zfs/pathname.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index 595918e5a742..5174e2c46633 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -78,7 +78,7 @@ static inline void rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <, RANGE_SEG_NUM_TYPES); size_t size = 0; switch (rt->rt_type) { case RANGE_SEG32: @@ -91,9 +91,9 @@ rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) size = sizeof (range_seg_gap_t); break; default: - VERIFY(0); + __builtin_unreachable(); } - bcopy(src, dest, size); + memcpy(dest, src, size); } void @@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } +__attribute__((always_inline)) inline static int range_tree_seg32_compare(const void *x1, const void *x2) { @@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg64_compare(const void *x1, const void *x2) { @@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg_gap_compare(const void *x1, const void *x2) { @@ -187,11 +190,18 @@ range_tree_seg_gap_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, + range_tree_seg32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, + range_tree_seg64_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, + range_tree_seg_gap_compare) + range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, - uint64_t start, uint64_t shift, - int (*zfs_btree_compare) (const void *, const void *), - uint64_t gap) +range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift, uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); @@ -199,23 +209,27 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = range_tree_seg32_compare; + bt_find = range_tree_seg32_find_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = range_tree_seg64_compare; + bt_find = range_tree_seg64_find_in_buf; break; case RANGE_SEG_GAP: size = sizeof (range_seg_gap_t); compare = range_tree_seg_gap_compare; + bt_find = range_tree_seg_gap_find_in_buf; break; default: panic("Invalid range seg type %d", type); } - zfs_btree_create(&rt->rt_root, compare, size); + zfs_btree_create(&rt->rt_root, compare, bt_find, size); rt->rt_ops = ops; rt->rt_gap = gap; @@ 
-223,7 +237,6 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, rt->rt_type = type; rt->rt_start = start; rt->rt_shift = shift; - rt->rt_btree_compare = zfs_btree_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); @@ -232,10 +245,10 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, } range_tree_t * -range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, +range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift) { - return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0)); + return (range_tree_create_gap(ops, type, arg, start, shift, 0)); } void @@ -372,6 +385,7 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * invalid as soon as we do any mutating btree operations. */ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); + ASSERT3P(rs_after, !=, NULL); rs_set_start_raw(rs_after, rt, before_start); rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; @@ -701,7 +715,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) zfs_btree_clear(&rt->rt_root); } - bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); + memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram)); rt->rt_space = 0; } @@ -741,76 +755,6 @@ range_tree_is_empty(range_tree_t *rt) return (range_tree_space(rt) == 0); } -/* ARGSUSED */ -void -rt_btree_create(range_tree_t *rt, void *arg) -{ - zfs_btree_t *size_tree = arg; - - size_t size; - switch (rt->rt_type) { - case RANGE_SEG32: - size = sizeof (range_seg32_t); - break; - case RANGE_SEG64: - size = sizeof (range_seg64_t); - break; - case RANGE_SEG_GAP: - size = sizeof (range_seg_gap_t); - break; - default: - panic("Invalid range seg type %d", rt->rt_type); - } - zfs_btree_create(size_tree, rt->rt_btree_compare, size); -} - -/* ARGSUSED */ -void -rt_btree_destroy(range_tree_t *rt, void *arg) -{ - zfs_btree_t *size_tree = arg; - ASSERT0(zfs_btree_numnodes(size_tree)); - - zfs_btree_destroy(size_tree); -} - -/* ARGSUSED */ -void -rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - zfs_btree_t *size_tree = arg; - - zfs_btree_add(size_tree, rs); -} - -/* ARGSUSED */ -void -rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - zfs_btree_t *size_tree = arg; - - zfs_btree_remove(size_tree, rs); -} - -/* ARGSUSED */ -void -rt_btree_vacate(range_tree_t *rt, void *arg) -{ - zfs_btree_t *size_tree = arg; - zfs_btree_clear(size_tree); - zfs_btree_destroy(size_tree); - - rt_btree_create(rt, arg); -} - -range_tree_ops_t rt_btree_ops = { - .rtop_create = rt_btree_create, - .rtop_destroy = rt_btree_destroy, - .rtop_add = rt_btree_add, - .rtop_remove = rt_btree_remove, - .rtop_vacate = rt_btree_vacate -}; - /* * Remove any overlapping ranges between the given segment [start, end) * from removefrom. Add non-overlapping leftovers to addto. diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c index 35a379dded69..718bbb34a8d5 100644 --- a/sys/contrib/openzfs/module/zfs/refcount.c +++ b/sys/contrib/openzfs/module/zfs/refcount.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -26,43 +26,50 @@ #include <sys/zfs_context.h> #include <sys/zfs_refcount.h> +#ifdef ZFS_DEBUG /* * Reference count tracking is disabled by default. It's memory requirements * are reasonable, however as implemented it consumes a significant amount of * cpu time. Until its performance is improved it should be manually enabled. */ -int reference_tracking_enable = FALSE; -int reference_history = 3; /* tunable */ +int reference_tracking_enable = B_FALSE; +static uint_t reference_history = 3; /* tunable */ -#ifdef ZFS_DEBUG static kmem_cache_t *reference_cache; -static kmem_cache_t *reference_history_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - reference_history_cache = kmem_cache_create("reference_history_cache", - sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); - kmem_cache_destroy(reference_history_cache); +} + +static int +zfs_refcount_compare(const void *x1, const void *x2) +{ + const reference_t *r1 = (const reference_t *)x1; + const reference_t *r2 = (const reference_t *)x2; + + int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); + int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number); + int cmp = cmp1 ? cmp1 : cmp2; + return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2)); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&rc->rc_list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; @@ -86,19 +93,15 @@ void zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; + void *cookie = NULL; ASSERT3U(rc->rc_count, ==, number); - while ((ref = list_head(&rc->rc_list))) { - list_remove(&rc->rc_list, ref); + while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) kmem_cache_free(reference_cache, ref); - } - list_destroy(&rc->rc_list); + avl_destroy(&rc->rc_tree); - while ((ref = list_head(&rc->rc_removed))) { - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, ref->ref_removed); + while ((ref = list_remove_head(&rc->rc_removed))) kmem_cache_free(reference_cache, ref); - } list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } @@ -124,10 +127,10 @@ zfs_refcount_count(zfs_refcount_t *rc) int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref = NULL; + reference_t *ref; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); @@ -136,9 +139,9 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; + ref->ref_search = B_FALSE; mutex_enter(&rc->rc_mtx); - ASSERT3U(rc->rc_count, >=, 0); - list_insert_head(&rc->rc_list, ref); + avl_add(&rc->rc_tree, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -152,51 +155,55 @@ 
zfs_refcount_add(zfs_refcount_t *rc, const void *holder) return (zfs_refcount_add_many(rc, 1, holder)); } +void +zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (likely(!rc->rc_tracked)) + (void) zfs_refcount_add_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_add(rc, holder); +} + int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref; + reference_t *ref, s; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } + s.ref_holder = holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder && ref->ref_number == number) { - list_remove(&rc->rc_list, ref); - if (reference_history > 0) { - ref->ref_removed = - kmem_cache_alloc(reference_history_cache, - KM_SLEEP); - list_insert_head(&rc->rc_removed, ref); - rc->rc_removed_count++; - if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, - ref->ref_removed); - kmem_cache_free(reference_cache, ref); - rc->rc_removed_count--; - } - } else { - kmem_cache_free(reference_cache, ref); - } - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); + ref = avl_find(&rc->rc_tree, &s, NULL); + if (unlikely(ref == NULL)) { + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); + } + avl_remove(&rc->rc_tree, ref); + if (reference_history > 0) { + list_insert_head(&rc->rc_removed, ref); + if (rc->rc_removed_count >= reference_history) { + ref = list_remove_tail(&rc->rc_removed); + kmem_cache_free(reference_cache, ref); + } else { + rc->rc_removed_count++; } + } else { + kmem_cache_free(reference_cache, ref); } - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); } int64_t @@ -206,33 +213,49 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) } void +zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (likely(!rc->rc_tracked)) + (void) zfs_refcount_remove_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_remove(rc, holder); +} + +void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { - int64_t count, removed_count; - list_t list, removed; + avl_tree_t tree; + list_t removed; + reference_t *ref; + void *cookie = NULL; + uint64_t count; + uint_t removed_count; - list_create(&list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; - list_move_tail(&list, &src->rc_list); + avl_swap(&tree, &src->rc_tree); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; - 
list_move_tail(&dst->rc_list, &list); + if (avl_is_empty(&dst->rc_tree)) + avl_swap(&dst->rc_tree, &tree); + else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) + avl_add(&dst->rc_tree, ref); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); - list_destroy(&list); + avl_destroy(&tree); list_destroy(&removed); } @@ -240,23 +263,19 @@ void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { - reference_t *ref; - boolean_t found = B_FALSE; + reference_t *ref, s; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return; + s.ref_holder = current_holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == current_holder && - ref->ref_number == number) { - ref->ref_holder = new_holder; - found = B_TRUE; - break; - } - } - ASSERT(found); + ref = avl_find(&rc->rc_tree, &s, NULL); + ASSERT(ref); + ref->ref_holder = new_holder; + avl_update(&rc->rc_tree, ref); mutex_exit(&rc->rc_mtx); } @@ -276,21 +295,23 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (zfs_refcount_count(rc) > 0); + s.ref_holder = holder; + s.ref_number = 0; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - } + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref && ref->ref_holder == holder; mutex_exit(&rc->rc_mtx); - return (B_FALSE); + return (res); } /* @@ -301,21 +322,23 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder) boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (B_TRUE); mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_FALSE); - } - } + s.ref_holder = holder; + s.ref_number = 0; + s.ref_search = B_TRUE; + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref == NULL || ref->ref_holder != holder; mutex_exit(&rc->rc_mtx); - return (B_TRUE); + return (res); } EXPORT_SYMBOL(zfs_refcount_create); @@ -327,10 +350,10 @@ EXPORT_SYMBOL(zfs_refcount_remove); EXPORT_SYMBOL(zfs_refcount_held); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, ,reference_tracking_enable, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW, "Track reference holders to refcount_t objects"); -ZFS_MODULE_PARAM(zfs, ,reference_history, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW, "Maximum reference holders being tracked"); /* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/sys/contrib/openzfs/module/zfs/rrwlock.c b/sys/contrib/openzfs/module/zfs/rrwlock.c index d23fc3ad1067..a8c438bb6ebd 100644 --- a/sys/contrib/openzfs/module/zfs/rrwlock.c +++ b/sys/contrib/openzfs/module/zfs/rrwlock.c @@ -6,7 +6,7 @@ * 
You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -77,7 +77,7 @@ uint_t rrw_tsd_key; typedef struct rrw_node { struct rrw_node *rn_next; rrwlock_t *rn_rrl; - void *rn_tag; + const void *rn_tag; } rrw_node_t; static rrw_node_t * @@ -99,7 +99,7 @@ rrn_find(rrwlock_t *rrl) * Add a node to the head of the singly linked list. */ static void -rrn_add(rrwlock_t *rrl, void *tag) +rrn_add(rrwlock_t *rrl, const void *tag) { rrw_node_t *rn; @@ -115,7 +115,7 @@ rrn_add(rrwlock_t *rrl, void *tag) * thread's list and return TRUE; otherwise return FALSE. */ static boolean_t -rrn_find_and_remove(rrwlock_t *rrl, void *tag) +rrn_find_and_remove(rrwlock_t *rrl, const void *tag) { rrw_node_t *rn; rrw_node_t *prev = NULL; @@ -160,7 +160,7 @@ rrw_destroy(rrwlock_t *rrl) } static void -rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) +rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, const void *tag) { mutex_enter(&rrl->rr_lock); #if !defined(ZFS_DEBUG) && defined(_KERNEL) @@ -192,7 +192,7 @@ rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) } void -rrw_enter_read(rrwlock_t *rrl, void *tag) +rrw_enter_read(rrwlock_t *rrl, const void *tag) { rrw_enter_read_impl(rrl, B_FALSE, tag); } @@ -204,7 +204,7 @@ rrw_enter_read(rrwlock_t *rrl, void *tag) * the pending writer does not work, so we have to give an explicit hint here. */ void -rrw_enter_read_prio(rrwlock_t *rrl, void *tag) +rrw_enter_read_prio(rrwlock_t *rrl, const void *tag) { rrw_enter_read_impl(rrl, B_TRUE, tag); } @@ -228,7 +228,7 @@ rrw_enter_write(rrwlock_t *rrl) } void -rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +rrw_enter(rrwlock_t *rrl, krw_t rw, const void *tag) { if (rw == RW_READER) rrw_enter_read(rrl, tag); @@ -237,7 +237,7 @@ rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) } void -rrw_exit(rrwlock_t *rrl, void *tag) +rrw_exit(rrwlock_t *rrl, const void *tag) { mutex_enter(&rrl->rr_lock); #if !defined(ZFS_DEBUG) && defined(_KERNEL) @@ -339,7 +339,7 @@ rrm_destroy(rrmlock_t *rrl) } void -rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) +rrm_enter(rrmlock_t *rrl, krw_t rw, const void *tag) { if (rw == RW_READER) rrm_enter_read(rrl, tag); @@ -358,7 +358,7 @@ rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) #define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) void -rrm_enter_read(rrmlock_t *rrl, void *tag) +rrm_enter_read(rrmlock_t *rrl, const void *tag) { rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); } @@ -373,7 +373,7 @@ rrm_enter_write(rrmlock_t *rrl) } void -rrm_exit(rrmlock_t *rrl, void *tag) +rrm_exit(rrmlock_t *rrl, const void *tag) { int i; diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index 2604a7513ecf..0ae4c331dd36 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,6 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 
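Aside, not part of the patch: the refcount.c hunks above convert reference tracking from a linked list to an AVL tree keyed on (holder, number). The comparator breaks ties between real nodes by node address so duplicate holds can coexist, while a stack-allocated key with ref_search set compares equal to any matching node. The self-contained approximation below mirrors zfs_refcount_compare(), with TREE_CMP/TREE_PCMP replaced by a local macro.

#include <stdint.h>
#include <stdio.h>

typedef struct reference {
	const void *ref_holder;
	uint64_t ref_number;
	int ref_search;		/* set only on lookup keys, never on tree nodes */
} reference_t;

#define	CMP(a, b)	(((a) > (b)) - ((a) < (b)))

static int
reference_compare(const reference_t *r1, const reference_t *r2)
{
	int cmp = CMP((uintptr_t)r1->ref_holder, (uintptr_t)r2->ref_holder);

	if (cmp == 0)
		cmp = CMP(r1->ref_number, r2->ref_number);
	if (cmp != 0 || r1->ref_search)
		return (cmp);
	/* Identical (holder, number): order by address so duplicates coexist. */
	return (CMP((uintptr_t)r1, (uintptr_t)r2));
}

int
main(void)
{
	int holder;			/* any stable address serves as a tag */
	reference_t a = { &holder, 1, 0 };
	reference_t b = { &holder, 1, 0 };
	reference_t key = { &holder, 1, 1 };

	/* Two identical holds still order deterministically against each other. */
	printf("a vs b:   %d\n", reference_compare(&a, &b));
	/* A search key matches either of them. */
	printf("key vs a: %d\n", reference_compare(&key, &a));
	printf("key vs b: %d\n", reference_compare(&key, &b));
	return (0);
}

This is the property avl_find() relies on in zfs_refcount_remove_many() and zfs_refcount_held() above: the search key lands on any hold with the right holder and count, while real nodes never collide with each other on insert.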
* Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 RackTop Systems, Inc. */ #include <sys/zfs_context.h> @@ -141,7 +142,7 @@ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); -arc_byteswap_func_t sa_bswap_table[] = { +static arc_byteswap_func_t sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, @@ -160,7 +161,7 @@ do { \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ - bcopy(s, t, l); \ + memcpy(t, s, l); \ } \ } else { \ sa_copy_data(f, s, t, l); \ @@ -178,7 +179,7 @@ do { \ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. */ -sa_attr_reg_t sa_legacy_attrs[] = { +static const sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, @@ -200,32 +201,32 @@ sa_attr_reg_t sa_legacy_attrs[] = { /* * This is only used for objects of type DMU_OT_ZNODE */ -sa_attr_type_t sa_legacy_zpl_layout[] = { +static const sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. */ -sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; +static const sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; -static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); +static const size_t sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); static kmem_cache_t *sa_cache = NULL; -/*ARGSUSED*/ static int sa_cache_constructor(void *buf, void *unused, int kmflag) { + (void) unused, (void) kmflag; sa_handle_t *hdl = buf; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } -/*ARGSUSED*/ static void sa_cache_destructor(void *buf, void *unused) { + (void) unused; sa_handle_t *hdl = buf; mutex_destroy(&hdl->sa_lock); } @@ -285,12 +286,11 @@ sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t -sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) +sa_layout_info_hash(const sa_attr_type_t *attrs, int attr_count) { - int i; uint64_t crc = -1ULL; - for (i = 0; i != attr_count; i++) + for (int i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); @@ -370,7 +370,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, - bulk[i].sa_size); + MIN(bulk[i].sa_size, bulk[i].sa_length)); } continue; @@ -402,7 +402,7 @@ sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, } static sa_lot_t * -sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, +sa_add_layout_entry(objset_t *os, const sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; @@ -415,7 +415,7 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + memcpy(tb->lot_attrs, attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; 
tb->lot_instance = 0; @@ -512,7 +512,7 @@ static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { - bcopy(datastart, target, buflen); + memcpy(target, datastart, buflen); } else { boolean_t start; int bytes; @@ -524,7 +524,7 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); - bcopy(dataptr, saptr, length); + memcpy(saptr, dataptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; @@ -831,7 +831,7 @@ sa_free_attr_table(sa_os_t *sa) } static int -sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) +sa_attr_table_setup(objset_t *os, const sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; @@ -992,8 +992,8 @@ bail: } int -sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, - sa_attr_type_t **user_table) +sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs, + int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; @@ -1069,8 +1069,8 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, za.za_num_integers); break; } - VERIFY(ddi_strtoull(za.za_name, NULL, 10, - (unsigned long long *)&lot_num) == 0); + VERIFY0(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num)); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, @@ -1202,6 +1202,7 @@ sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; + IMPLY(reg_length == 0, IS_SA_BONUSTYPE(type)); if (reg_length) { attr_length = reg_length; idx_len = 0; @@ -1218,11 +1219,11 @@ sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, } } -/*ARGSUSED*/ static void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { + (void) hdr, (void) length_idx, (void) variable_length; sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; @@ -1309,10 +1310,10 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) return (0); } -/*ARGSUSED*/ static void sa_evict_sync(void *dbu) { + (void) dbu; panic("evicting sa dbuf\n"); } @@ -1450,13 +1451,13 @@ sa_handle_get(objset_t *objset, uint64_t objid, void *userp, } int -sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) +sa_buf_hold(objset_t *objset, uint64_t obj_num, const void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void -sa_buf_rele(dmu_buf_t *db, void *tag) +sa_buf_rele(dmu_buf_t *db, const void *tag) { dmu_buf_rele(db, tag); } @@ -1665,8 +1666,9 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) &xattr, 8); if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); + memcpy(scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; @@ -1874,7 +1876,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); - bcopy(hdl->sa_bonus->db_data, old_data[0], + memcpy(old_data[0], hdl->sa_bonus->db_data, 
hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { @@ -1887,7 +1889,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP); - bcopy(hdl->sa_spill->db_data, old_data[1], + memcpy(old_data[1], hdl->sa_spill->db_data, hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; @@ -1917,7 +1919,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); - for (; k != 2; k++) { + for (; ; k++) { /* * Iterate over each attribute in layout. Fetch the * size of variable-length attributes needing rewrite diff --git a/sys/contrib/openzfs/module/zfs/sha256.c b/sys/contrib/openzfs/module/zfs/sha2_zfs.c index d297768eada5..872b1e53ee66 100644 --- a/sys/contrib/openzfs/module/zfs/sha256.c +++ b/sys/contrib/openzfs/module/zfs/sha2_zfs.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -18,16 +18,14 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. */ + #include <sys/zfs_context.h> -#include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/sha2.h> #include <sys/abd.h> @@ -41,11 +39,11 @@ sha_incremental(void *buf, size_t size, void *arg) return (0); } -/*ARGSUSED*/ void -abd_checksum_SHA256(abd_t *abd, uint64_t size, +abd_checksum_sha256(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; int ret; SHA2_CTX ctx; zio_cksum_t tmp; @@ -78,11 +76,11 @@ bswap: zcp->zc_word[3] = BE_64(tmp.zc_word[3]); } -/*ARGSUSED*/ void -abd_checksum_SHA512_native(abd_t *abd, uint64_t size, +abd_checksum_sha512_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; SHA2_CTX ctx; SHA2Init(SHA512_256, &ctx); @@ -90,14 +88,13 @@ abd_checksum_SHA512_native(abd_t *abd, uint64_t size, SHA2Final(zcp, &ctx); } -/*ARGSUSED*/ void -abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, +abd_checksum_sha512_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); + abd_checksum_sha512_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); diff --git a/sys/contrib/openzfs/module/zfs/skein_zfs.c b/sys/contrib/openzfs/module/zfs/skein_zfs.c index 11b9940e027e..4b2aca888eee 100644 --- a/sys/contrib/openzfs/module/zfs/skein_zfs.c +++ b/sys/contrib/openzfs/module/zfs/skein_zfs.c @@ -41,18 +41,17 @@ skein_incremental(void *buf, size_t size, void *arg) * function requires the presence of a ctx_template that should be allocated * using abd_checksum_skein_tmpl_init. 
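Aside, not part of the patch: a recurring mechanical change through these files (metaslab.c, range_tree.c, sa.c, skein_zfs.c) is replacing the legacy bcopy()/bzero() primitives with the ISO C memcpy()/memset(). The only trap is bcopy()'s reversed argument order, shown in the trivial sketch below.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char src[8] = "openzfs";
	char dst[8];

	/* bcopy(src, dst, len) becomes memcpy(dst, src, len). */
	memcpy(dst, src, sizeof (dst));

	/* bzero(buf, len) becomes memset(buf, 0, len). */
	memset(src, 0, sizeof (src));

	printf("%s\n", dst);
	return (0);
}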
*/ -/*ARGSUSED*/ void abd_checksum_skein_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { - Skein_512_Ctxt_t ctx; + Skein_512_Ctxt_t ctx; ASSERT(ctx_template != NULL); - bcopy(ctx_template, &ctx, sizeof (ctx)); + memcpy(&ctx, ctx_template, sizeof (ctx)); (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); (void) Skein_512_Final(&ctx, (uint8_t *)zcp); - bzero(&ctx, sizeof (ctx)); + memset(&ctx, 0, sizeof (ctx)); } /* @@ -80,9 +79,8 @@ abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, void * abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) { - Skein_512_Ctxt_t *ctx; + Skein_512_Ctxt_t *ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); - ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, salt->zcs_bytes, sizeof (salt->zcs_bytes)); return (ctx); @@ -95,8 +93,8 @@ abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) void abd_checksum_skein_tmpl_free(void *ctx_template) { - Skein_512_Ctxt_t *ctx = ctx_template; + Skein_512_Ctxt_t *ctx = ctx_template; - bzero(ctx, sizeof (*ctx)); + memset(ctx, 0, sizeof (*ctx)); kmem_free(ctx, sizeof (*ctx)); } diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 7546e3e414f1..638572996c3a 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -33,6 +33,8 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024, Klara Inc. */ /* @@ -52,6 +54,7 @@ #include <sys/dmu_tx.h> #include <sys/zap.h> #include <sys/zil.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/vdev_impl.h> #include <sys/vdev_removal.h> @@ -61,6 +64,7 @@ #include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -81,7 +85,6 @@ #include <sys/arc.h> #include <sys/callb.h> #include <sys/systeminfo.h> -#include <sys/spa_boot.h> #include <sys/zfs_ioctl.h> #include <sys/dsl_scan.h> #include <sys/zfeature.h> @@ -98,6 +101,27 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +#include <cityhash.h> + +/* + * spa_thread() existed on Illumos as a parent thread for the various worker + * threads that actually run the pool, as a way to both reference the entire + * pool work as a single object, and to share properties like scheduling + * options. It has not yet been adapted to Linux or FreeBSD. This define is + * used to mark related parts of the code to make things easier for the reader, + * and to compile this code out. 
It can be removed when someone implements it, + * moves it to some Illumos-specific place, or removes it entirely. + */ +#undef HAVE_SPA_THREAD + +/* + * The "System Duty Cycle" scheduling class is an Illumos feature to help + * prevent CPU-intensive kernel threads from affecting latency on interactive + * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is + * gated behind a define. On Illumos SDC depends on spa_thread(), but + * spa_thread() also has other uses, so this is a separate define. + */ +#undef HAVE_SYSDC /* * The interval, in seconds, at which failed configuration cache file writes @@ -107,16 +131,16 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ - ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ + ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } -#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } +#define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -137,42 +161,60 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE - * macros. Other operations process a large amount of data; the ZTI_BATCH + * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, - * but with number of taskqs also scaling with number of CPUs. + * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs + * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue - * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that - * need to be handled with minimum delay. + * and interrupt) and then to reserve threads for high priority I/Os that + * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT + * implementation, so separate high priority threads are used there. 
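Before the default table below, it may help to see what the ZTI_* helpers above expand to; the (mode, value, count) triples come straight from the macro definitions, and the ZTI_P numbers are illustrative only:

/*
 * ZTI_N(8)    -> { ZTI_MODE_FIXED, 8, 1 }  one taskq, eight threads
 * ZTI_P(4, 2) -> { ZTI_MODE_FIXED, 4, 2 }  two taskqs, four threads each
 * ZTI_SCALE   -> { ZTI_MODE_SCALE, 0, 1 }  taskq count scales with CPUs
 * ZTI_SYNC    -> { ZTI_MODE_SYNC,  0, 1 }  sized from CPUs and allocators
 *                                          when the pool is activated
 * ZTI_NULL    -> { ZTI_MODE_NULL,  0, 0 }  no taskq; a high-priority slot
 *                                          falls back to the regular one
 */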
*/ -const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ +#ifdef illumos + { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ +#else + { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ +#endif { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); -static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, + const char **ereport); static void spa_vdev_resilver_done(spa_t *spa); -uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ -uint_t zio_taskq_batch_tpq; /* threads per taskq */ -boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ -uint_t zio_taskq_basedc = 80; /* base duty cycle */ +/* + * Percentage of all CPUs that can be used by the metaslab preload taskq. + */ +static uint_t metaslab_preload_pct = 50; + +static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ +static uint_t zio_taskq_batch_tpq; /* threads per taskq */ + +#ifdef HAVE_SYSDC +static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ +#endif + +#ifdef HAVE_SPA_THREAD +static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ +#endif -boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +static uint_t zio_taskq_write_tpq = 16; /* * Report any spa_load_verify errors found, but do not fail spa_load. @@ -195,7 +237,7 @@ boolean_t spa_mode_readable_spacemaps = B_FALSE; /* * For debugging purposes: print out vdev tree during pool import. */ -int spa_load_print_vdev_tree = B_FALSE; +static int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing @@ -218,7 +260,7 @@ int spa_load_print_vdev_tree = B_FALSE; * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ -unsigned long zfs_max_missing_tvds = 0; +uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only @@ -244,28 +286,28 @@ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ -boolean_t zfs_pause_spa_sync = B_FALSE; +static const boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ -int zfs_livelist_condense_zthr_pause = 0; -int zfs_livelist_condense_sync_pause = 0; +static int zfs_livelist_condense_zthr_pause = 0; +static int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. 
*/ -int zfs_livelist_condense_sync_cancel = 0; -int zfs_livelist_condense_zthr_cancel = 0; +static int zfs_livelist_condense_sync_cancel = 0; +static int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ -int zfs_livelist_condense_new_alloc = 0; +static int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== @@ -277,7 +319,7 @@ int zfs_livelist_condense_new_alloc = 0; * Add a (source=src, propname=propval) list to an nvlist. */ static void -spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); @@ -296,6 +338,22 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, } /* + * Add a user property (source=src, propname=propval) to an nvlist. + */ +static void +spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, + zprop_source_t src) +{ + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* * Get property values from the spa configuration. */ static void @@ -341,6 +399,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, + brt_get_used(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, + brt_get_saved(spa), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, + brt_get_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); @@ -464,7 +528,8 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; - if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) + if ((prop = zpool_name_to_prop(za.za_name)) == + ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) continue; switch (za.za_integer_length) { @@ -507,7 +572,13 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) kmem_free(strval, za.za_num_integers); break; } - spa_prop_add_list(*nvp, prop, strval, 0, src); + if (prop != ZPOOL_PROP_INVAL) { + spa_prop_add_list(*nvp, prop, strval, 0, src); + } else { + src = ZPROP_SRC_LOCAL; + spa_prop_add_user(*nvp, za.za_name, strval, + src); + } kmem_free(strval, za.za_num_integers); break; @@ -543,42 +614,53 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; - char *strval, *slash, *check, *fname; + const char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: - if (!zpool_prop_feature(propname)) { - error = SET_ERROR(EINVAL); - break; - } - /* * Sanitize the input. 
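spa_prop_add_user() earlier in this hunk gives user pool properties the familiar nested {source, value} shape; a userland-style sketch of building the same structure with libnvpair (the property name and value are invented, error checking omitted):

#include <libnvpair.h>
#include <sys/fs/zfs.h>	/* ZPROP_SOURCE, ZPROP_VALUE */

static void
example_add_user_prop(nvlist_t *props)
{
	nvlist_t *propval;

	/* Same layout spa_prop_get() hands back for "com.example:owner". */
	(void) nvlist_alloc(&propval, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(propval, ZPROP_SOURCE, ZPROP_SRC_LOCAL);
	(void) nvlist_add_string(propval, ZPROP_VALUE, "alice");
	(void) nvlist_add_nvlist(props, "com.example:owner", propval);
	nvlist_free(propval);
}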
*/ - if (nvpair_type(elem) != DATA_TYPE_UINT64) { - error = SET_ERROR(EINVAL); - break; - } + if (zfs_prop_user(propname)) { + if (strlen(propname) >= ZAP_MAXNAMELEN) { + error = SET_ERROR(ENAMETOOLONG); + break; + } - if (nvpair_value_uint64(elem, &intval) != 0) { - error = SET_ERROR(EINVAL); - break; - } + if (strlen(fnvpair_value_string(elem)) >= + ZAP_MAXVALUELEN) { + error = SET_ERROR(E2BIG); + break; + } + } else if (zpool_prop_feature(propname)) { + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = SET_ERROR(EINVAL); + break; + } - if (intval != 0) { - error = SET_ERROR(EINVAL); - break; - } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + if (intval != 0) { + error = SET_ERROR(EINVAL); + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = SET_ERROR(EINVAL); + break; + } - fname = strchr(propname, '@') + 1; - if (zfeature_lookup_name(fname, NULL) != 0) { + has_feature = B_TRUE; + } else { error = SET_ERROR(EINVAL); break; } - - has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: @@ -745,7 +827,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { - char *cachefile; + const char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), @@ -785,8 +867,14 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) prop == ZPOOL_PROP_READONLY) continue; + if (prop == ZPOOL_PROP_INVAL && + zfs_prop_user(nvpair_name(elem))) { + need_sync = B_TRUE; + break; + } + if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { - uint64_t ver; + uint64_t ver = 0; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); @@ -840,7 +928,6 @@ spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) } } -/*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { @@ -910,7 +997,16 @@ spa_change_guid(spa_t *spa) spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { - spa_write_cachefile(spa, B_FALSE, B_TRUE); + /* + * Clear the kobj flag from all the vdevs to allow + * vdev_cache_process_kobj_evt() to post events to all the + * vdevs since GUID is updated. + */ + vdev_clear_kobj_evt(spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); + + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } @@ -948,8 +1044,8 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); - bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); - bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); + memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), @@ -968,17 +1064,33 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; - boolean_t batch = B_FALSE; switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; - case ZTI_MODE_BATCH: - batch = B_TRUE; + case ZTI_MODE_SYNC: + + /* + * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, + * not to exceed the number of spa allocators, and align to it. 
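The sizing rule just described is easier to follow with concrete numbers. This throwaway userland sketch repeats the same integer math as the kernel statements that follow, with assumed inputs (128 online CPUs, the defaults of 80% and 16 CPUs per write taskq, and 4 allocators):

#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned boot_ncpus = 128;		/* assumed */
	unsigned zio_taskq_batch_pct = 80;	/* default */
	unsigned zio_taskq_write_tpq = 16;	/* default */
	unsigned spa_alloc_count = 4;		/* assumed */

	unsigned cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
	unsigned count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
	count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
	count = MIN(count, spa_alloc_count);
	while (spa_alloc_count % count != 0 &&
	    spa_alloc_count < count * 2)
		count--;
	unsigned value = MIN((zio_taskq_batch_pct + count / 2) / count, 100);

	/* Prints "4 taskqs at 20% of CPUs each". */
	(void) printf("%u taskqs at %u%% of CPUs each\n", count, value);
	return (0);
}

Keeping the taskq count a divisor of the allocator count pays off later in spa_taskq_dispatch(), which routes write-issue zios by io_allocator modulo the taskq count, so every allocator lands on a fixed taskq.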
+ */ + cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + count = MIN(count, spa->spa_alloc_count); + while (spa->spa_alloc_count % count != 0 && + spa->spa_alloc_count < count * 2) + count--; + + /* + * zio_taskq_batch_pct is unbounded and may exceed 100%, but no + * single taskq may have more threads than 100% of online cpus. + */ + value = (zio_taskq_batch_pct + count / 2) / count; + value = MIN(value, 100); flags |= TASKQ_THREADS_CPU_PCT; - value = MIN(zio_taskq_batch_pct, 100); break; case ZTI_MODE_SCALE: @@ -1025,7 +1137,7 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " - "spa_activate()", + "spa_taskqs_init()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } @@ -1045,13 +1157,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); +#ifdef HAVE_SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; - + (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { +#endif pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU @@ -1077,7 +1189,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); +#ifdef HAVE_SYSDC } +#endif tqs->stqs_taskq[i] = tq; } @@ -1102,54 +1216,309 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL /* - * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. - * Note that a type may have multiple discrete taskqs to avoid lock contention - * on the taskq itself. In that case we choose which taskq at random by using - * the low bits of gethrtime(). + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. + * + * Example (the defaults for READ and WRITE) + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='sync null scale null' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL. */ -void -spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +static const char *const modes[ZTI_NMODES] = { + "fixed", "scale", "sync", "null" +}; + +/* Parse the incoming config string. Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) { - spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - taskq_t *tq; + int err = 0; - ASSERT3P(tqs->stqs_taskq, !=, NULL); - ASSERT3U(tqs->stqs_count, !=, 0); + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; - if (tqs->stqs_count == 1) { - tq = tqs->stqs_taskq[0]; - } else { - tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`. The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. 
+ */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters? */ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_SYNC: { + const zio_taskq_info_t zti = ZTI_SYNC; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; } - taskq_dispatch_ent(tq, func, arg, flags, ent); + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! 
Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) +{ + int pos = 0; + + /* Build paramater string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + if (add_newline) + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); } +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); +} +#else /* - * Same as spa_taskq_dispatch_ent() but block on the task until completion. + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. + */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err; + + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err || req->newptr == NULL) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + +/* + * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. + * Note that a type may have multiple discrete taskqs to avoid lock contention + * on the taskq itself. */ void -spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags) +spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, zio_t *zio, boolean_t cutinline) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; - taskqid_t id; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. 
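To make the accepted grammar concrete, here are a few strings spa_taskq_param_set() above would take (or reject) and the rows they would produce; the columns are ISSUE, ISSUE_HIGH, INTR, INTR_HIGH, and the settings themselves are only examples:

/*
 * "fixed,1,8 null scale null"   -> { ZTI_N(8),     ZTI_NULL, ZTI_SCALE, ZTI_NULL }
 * "fixed,2,32 null scale null"  -> { ZTI_P(32, 2), ZTI_NULL, ZTI_SCALE, ZTI_NULL }
 * "sync null scale null"        -> { ZTI_SYNC,     ZTI_NULL, ZTI_SCALE, ZTI_NULL }
 *
 * "null" in the ISSUE or INTR column, a malformed "fixed" parameter, or a
 * string with fewer than four entries returns EINVAL and leaves the live
 * zio_taskqs row untouched.
 */

On Linux these rows would normally be set through the zio_taskq_read/zio_taskq_write module options named above, and on FreeBSD through the corresponding sysctl handlers; the exact knob names outside this file are not shown in this diff.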
+ */ + ASSERT(zio); + ASSERT(taskq_empty_ent(&zio->io_tqent)); + if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; + } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && + ZIO_HAS_ALLOCATOR(zio)) { + tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } - id = taskq_dispatch(tq, func, arg, flags); - if (id) - taskq_wait_id(tq, id); + taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, + &zio->io_tqent); } static void @@ -1162,11 +1531,6 @@ spa_create_zio_taskqs(spa_t *spa) } } -/* - * Disabled until spa_thread() can be adapted for Linux. - */ -#undef HAVE_SPA_THREAD - #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) @@ -1207,9 +1571,11 @@ spa_thread(void *arg) pool_unlock(); } +#ifdef HAVE_SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } +#endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; @@ -1238,24 +1604,26 @@ spa_thread(void *arg) } #endif +extern metaslab_ops_t *metaslab_allocator(spa_t *spa); + /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { + metaslab_ops_t *msp = metaslab_allocator(spa); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_read_spacemaps = spa_mode_readable_spacemaps; - spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_embedded_log_class = - metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); - spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_normal_class = metaslab_class_create(spa, msp); + spa->spa_log_class = metaslab_class_create(spa, msp); + spa->spa_embedded_log_class = metaslab_class_create(spa, msp); + spa->spa_special_class = metaslab_class_create(spa, msp); + spa->spa_dedup_class = metaslab_class_create(spa, msp); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); @@ -1313,6 +1681,11 @@ spa_activate(spa_t *spa, spa_mode_t mode) avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_healed, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + + spa_activate_os(spa); spa_keystore_init(&spa->spa_keystore); @@ -1335,6 +1708,13 @@ spa_activate(spa_t *spa, spa_mode_t mode) 1, INT_MAX, 0); /* + * The taskq to preload metaslabs. + */ + spa->spa_metaslab_taskq = taskq_create("z_metaslab", + metaslab_preload_pct, maxclsyspri, 1, INT_MAX, + TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); + + /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. 
@@ -1369,6 +1749,11 @@ spa_deactivate(spa_t *spa) spa->spa_zvol_taskq = NULL; } + if (spa->spa_metaslab_taskq) { + taskq_destroy(spa->spa_metaslab_taskq); + spa->spa_metaslab_taskq = NULL; + } + if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; @@ -1421,6 +1806,7 @@ spa_deactivate(spa_t *spa) spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); + avl_destroy(&spa->spa_errlist_healed); spa_keystore_fini(&spa->spa_keystore); @@ -1450,6 +1836,9 @@ spa_deactivate(spa_t *spa) thread_join(spa->spa_did); spa->spa_did = 0; } + + spa_deactivate_os(spa); + } /* @@ -1542,16 +1931,16 @@ spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; + log_summary_entry_t *e; + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { VERIFY0(e->lse_mscount); - list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } @@ -1579,6 +1968,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } + if (spa->spa_raidz_expand_zthr != NULL) { + zthr_destroy(spa->spa_raidz_expand_zthr); + spa->spa_raidz_expand_zthr = NULL; + } } /* @@ -1587,7 +1980,8 @@ spa_destroy_aux_threads(spa_t *spa) static void spa_unload(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); @@ -1596,25 +1990,33 @@ spa_unload(spa_t *spa) spa_wake_waiters(spa); /* - * If the log space map feature is enabled and the pool is getting - * exported (but not destroyed), we want to spend some time flushing - * as many metaslabs as we can in an attempt to destroy log space - * maps and save import time. + * If we have set the spa_final_txg, we have already performed the + * tasks below in spa_export_common(). We should not redo it here since + * we delay the final TXGs beyond what spa_final_txg is set at. */ - if (spa_should_flush_logs_on_unload(spa)) - spa_unload_log_sm_flush_all(spa); + if (spa->spa_final_txg == UINT64_MAX) { + /* + * If the log space map feature is enabled and the pool is + * getting exported (but not destroyed), we want to spend some + * time flushing as many metaslabs as we can in an attempt to + * destroy log space maps and save import time. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); - /* - * Stop async tasks. - */ - spa_async_suspend(spa); + /* + * Stop async tasks. 
+ */ + spa_async_suspend(spa); - if (spa->spa_root_vdev) { - vdev_t *root_vdev = spa->spa_root_vdev; - vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); - vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); - vdev_autotrim_stop_all(spa); - vdev_rebuild_stop_all(spa); + if (spa->spa_root_vdev) { + vdev_t *root_vdev = spa->spa_root_vdev; + vdev_initialize_stop_all(root_vdev, + VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); + } } /* @@ -1629,13 +2031,7 @@ spa_unload(spa_t *spa) * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ - if (spa->spa_root_vdev != NULL) { - for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { - vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; - if (vc->vdev_mg != NULL) - taskq_wait(vc->vdev_mg->mg_taskq); - } - } + taskq_wait(spa->spa_metaslab_taskq); if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); @@ -1680,6 +2076,7 @@ spa_unload(spa_t *spa) } ddt_unload(spa); + brt_unload(spa); spa_unload_log_sm_metadata(spa); /* @@ -1687,9 +2084,9 @@ spa_unload(spa_t *spa) */ spa_l2cache_drop(spa); - for (int i = 0; i < spa->spa_spares.sav_count; i++) - vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { + for (int i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; @@ -1700,11 +2097,11 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { - vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); - vdev_free(spa->spa_l2cache.sav_vdevs[i]); - } if (spa->spa_l2cache.sav_vdevs) { + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; @@ -1728,6 +2125,8 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } + spa->spa_raidz_expand = NULL; + spa_config_exit(spa, SCL_ALL, spa); } @@ -1762,20 +2161,21 @@ spa_load_spares(spa_t *spa) /* * First, close and free any existing spare vdevs. 
*/ - for (i = 0; i < spa->spa_spares.sav_count; i++) { - vd = spa->spa_spares.sav_vdevs[i]; + if (spa->spa_spares.sav_vdevs) { + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; - /* Undo the call to spa_activate() below */ - if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, - B_FALSE)) != NULL && tvd->vdev_isspare) - spa_spare_remove(tvd); - vdev_close(vd); - vdev_free(vd); - } + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); + } - if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); + } if (spa->spa_spares.sav_config == NULL) nspares = 0; @@ -1851,7 +2251,8 @@ spa_load_spares(spa_t *spa) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count); + ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, + spa->spa_spares.sav_count); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); @@ -1978,30 +2379,31 @@ spa_load_l2cache(spa_t *spa) for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); - fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, - sav->sav_count); + fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + (const nvlist_t * const *)l2cache, sav->sav_count); out: /* * Purge vdevs that were dropped */ - for (i = 0; i < oldnvdevs; i++) { - uint64_t pool; + if (oldvdevs) { + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; - vd = oldvdevs[i]; - if (vd != NULL) { - ASSERT(vd->vdev_isl2cache); + vd = oldvdevs[i]; + if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); - if (spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && l2arc_vdev_present(vd)) - l2arc_remove_vdev(vd); - vdev_clear_stats(vd); - vdev_free(vd); + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) + l2arc_remove_vdev(vd); + vdev_clear_stats(vd); + vdev_free(vd); + } } - } - if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + } for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); @@ -2107,8 +2509,8 @@ spa_check_for_missing_logs(spa_t *spa) } if (idx > 0) { - fnvlist_add_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, child, idx); + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + (const nvlist_t * const *)child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); @@ -2243,12 +2645,13 @@ spa_claim_notify(zio_t *zio) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) - spa->spa_claim_max_txg = zio->io_bp->blk_birth; + if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) + spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { + boolean_t sle_verify_data; uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; @@ -2281,18 +2684,19 @@ spa_load_verify_done(zio_t *zio) * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. 
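A quick arithmetic check of the bound described above, with an assumed ARC target (the 8 GiB figure is made up):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t arc_target = 8ULL << 30;	/* assume arc_target_bytes() == 8 GiB */
	unsigned shift = 4;			/* default spa_load_verify_shift */

	/* 8 GiB >> 4 = 512 MiB of verification reads may be in flight. */
	(void) printf("%llu bytes in flight\n",
	    (unsigned long long)(arc_target >> shift));
	return (0);
}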
*/ -int spa_load_verify_shift = 4; -int spa_load_verify_metadata = B_TRUE; -int spa_load_verify_data = B_TRUE; +static uint_t spa_load_verify_shift = 4; +static int spa_load_verify_metadata = B_TRUE; +static int spa_load_verify_data = B_TRUE; -/*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || - BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) - return (0); + zio_t *rio = arg; + spa_load_error_t *sle = rio->io_private; + + (void) zilog, (void) dnp; + /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful @@ -2300,12 +2704,28 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (!spa_load_verify_metadata) return (0); - if (!BP_IS_METADATA(bp) && !spa_load_verify_data) + + /* + * Sanity check the block pointer in order to detect obvious damage + * before using the contents in subsequent checks or in zio_read(). + * When damaged consider it to be a metadata error since we cannot + * trust the BP_GET_TYPE and BP_GET_LEVEL values. + */ + if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + atomic_inc_64(&sle->sle_meta_count); + return (0); + } + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) + return (0); + + if (!BP_IS_METADATA(bp) && + (!spa_load_verify_data || !sle->sle_verify_data)) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; - zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); @@ -2321,10 +2741,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, return (0); } -/* ARGSUSED */ static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { + (void) dp, (void) arg; + if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); @@ -2342,7 +2763,8 @@ spa_load_verify(spa_t *spa) zpool_get_load_policy(spa->spa_config, &policy); - if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) + if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || + policy.zlp_maxmeta == UINT64_MAX) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); @@ -2353,6 +2775,13 @@ spa_load_verify(spa_t *spa) if (error != 0) return (error); + /* + * Verify data only if we are rewinding or error limit was set. + * Otherwise nothing except dbgmsg care about it to waste time. + */ + sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || + (policy.zlp_maxdata < UINT64_MAX); + rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); @@ -2397,6 +2826,8 @@ spa_load_verify(spa_t *spa) fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss); fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; @@ -2454,10 +2885,10 @@ spa_livelist_delete_check(spa_t *spa) return (spa->spa_livelists_to_delete != 0); } -/* ARGSUSED */ static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { + (void) z; spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } @@ -2549,7 +2980,6 @@ livelist_delete_sync(void *arg, dmu_tx_t *tx) * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. 
*/ -/* ARGSUSED */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { @@ -2795,7 +3225,6 @@ spa_livelist_condense_cb(void *arg, zthr_t *t) zfs_livelist_condense_zthr_cancel++; } -/* ARGSUSED */ /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. @@ -2803,6 +3232,7 @@ spa_livelist_condense_cb(void *arg, zthr_t *t) static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { + (void) z; spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && @@ -2833,8 +3263,7 @@ spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -2931,12 +3360,13 @@ spa_try_repair(spa_t *spa, nvlist_t *config) static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { - char *ereport = FM_EREPORT_ZFS_POOL; + const char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); + spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); @@ -2978,6 +3408,12 @@ vdev_count_verify_zaps(vdev_t *vd) spa_t *spa = vd->vdev_spa; uint64_t total = 0; + if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && + vd->vdev_root_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_root_zap)); + } if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, @@ -2995,6 +3431,8 @@ vdev_count_verify_zaps(vdev_t *vd) return (total); } +#else +#define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) #endif /* @@ -3146,18 +3584,23 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. + * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0; uint64_t import_delay; - hrtime_t import_expire; + hrtime_t import_expire, now; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; @@ -3195,9 +3638,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - while (gethrtime() < import_expire) { - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } + + int iterations = 0; + while ((now = gethrtime()) < import_expire) { + if (importing && iterations++ % 30 == 0) { + spa_import_progress_set_notes(spa, "Checking MMP " + "activity, %llu ms remaining", + (u_longlong_t)NSEC2MSEC(import_expire - now)); + } + + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3246,7 +3703,7 @@ out: * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { - char *hostname = "<unknown>"; + const char *hostname = "<unknown>"; uint64_t hostid = 0; if (mmp_label) { @@ -3279,11 +3736,66 @@ out: return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. + */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; - char *hostname; + const char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, @@ -3318,8 +3830,8 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type) int parse; vdev_t *rvd; uint64_t pool_guid; - char *comment; - char *compatibility; + const char *comment; + const char *compatibility; /* * Versioning wasn't explicitly added to the label until later, so if @@ -3581,6 +4093,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != 0) { + spa_load_note(spa, "uberblock raidz_reflow_info: " + "state=%u offset=%llu", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + } /* @@ 
-3599,7 +4117,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -3806,6 +4325,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, spa_config_exit(spa, SCL_ALL, FTAG); /* + * If 'zpool import' used a cached config, then the on-disk hostid and + * hostname may be different to the cached config in ways that should + * prevent import. Userspace can't discover this without a scan, but + * we know, so we add these values to LOAD_INFO so the caller can know + * the difference. + * + * Note that we have to do this before the config is regenerated, + * because the new config will have the hostid and hostname for this + * host, in readiness for import. + */ + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) + fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, + fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); + if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) + fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, + fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); + + /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to @@ -4163,6 +4700,7 @@ spa_ld_get_props(spa_t *spa) spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { + nvlist_free(mos_config); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* @@ -4323,7 +4861,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) error = spa_ld_log_spacemaps(spa); if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } @@ -4354,7 +4892,22 @@ spa_ld_load_dedup_tables(spa_t *spa) } static int -spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) +spa_ld_load_brt(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = brt_load(spa); + if (error != 0) { + spa_load_failed(spa, "brt_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int +spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) { vdev_t *rvd = spa->spa_root_vdev; @@ -4481,7 +5034,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa) int error = 0; ASSERT0(spa->spa_checkpoint_txg); - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), @@ -4721,13 +5275,14 @@ spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, * config stored in the MOS. 
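The hostid/hostname entries added to spa_load_info above are aimed at the import caller; a userland-style sketch of a consumer of the returned ZPOOL_CONFIG_LOAD_INFO nvlist (the helper is hypothetical, error paths omitted):

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

static void
report_last_writer(nvlist_t *load_info)
{
	uint64_t hostid = 0;
	const char *hostname = "<unknown>";

	if (nvlist_exists(load_info, ZPOOL_CONFIG_HOSTID))
		hostid = fnvlist_lookup_uint64(load_info, ZPOOL_CONFIG_HOSTID);
	if (nvlist_exists(load_info, ZPOOL_CONFIG_HOSTNAME))
		hostname = fnvlist_lookup_string(load_info,
		    ZPOOL_CONFIG_HOSTNAME);

	(void) printf("pool was last written by %s (hostid %llx)\n",
	    hostname, (unsigned long long)hostid);
}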
*/ static int -spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) +spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; + hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); @@ -4773,11 +5328,18 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) } /* + * Drop the namespace lock for the rest of the function. + */ + spa->spa_load_thread = curthread; + mutex_exit(&spa_namespace_lock); + + /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ + spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed @@ -4787,60 +5349,73 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ + spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ + spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) - return (error); + goto fail; /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ + spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve pool properties from the MOS. */ + spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) - return (error); + goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ + spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) - return (error); + goto fail; /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ + spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) - return (error); + goto fail; + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) - return (error); + goto fail; + + spa_import_progress_set_notes(spa, "Loading BRT"); + error = spa_ld_load_brt(spa); + if (error != 0) + goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ + spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) - return (error); + goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); @@ -4850,8 +5425,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * read-only mode but not read-write mode. We now have enough * information and can return to userland. 
*/ - return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, - ENOTSUP)); + error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP); + goto fail; } /* @@ -4859,15 +5435,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ + spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) - return (error); + goto fail; /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ + spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* @@ -4875,6 +5453,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ + spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; @@ -4882,6 +5461,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* + * Before we do any zio_write's, complete the raidz expansion + * scratch space copying, if necessary. + */ + if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) + vdev_raidz_reflow_copy_scratch(spa); + + /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ @@ -4891,6 +5477,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } + spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ @@ -4910,6 +5497,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * will have been set for us by ZIL traversal operations * performed above. */ + spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* @@ -4917,6 +5505,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ + spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); @@ -4925,6 +5514,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ + spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && @@ -4938,6 +5528,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) */ spa_history_log_version(spa, "open", NULL); + spa_import_progress_set_notes(spa, + "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); @@ -4950,27 +5542,40 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) * auxiliary threads above (from which the livelist * deletion zthr is part of). 
*/ + spa_import_progress_set_notes(spa, + "Cleaning up inconsistent objsets"); (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ + spa_import_progress_set_notes(spa, + "Cleaning up temporary userrefs"); dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_import_progress_set_notes(spa, "Restarting initialize"); vdev_initialize_restart(spa->spa_root_vdev); + spa_import_progress_set_notes(spa, "Restarting TRIM"); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_import_progress_set_notes(spa, "Finished importing"); } + zio_handle_import_delay(spa, gethrtime() - load_start); spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); +fail: + mutex_enter(&spa_namespace_lock); + spa->spa_load_thread = NULL; + cv_broadcast(&spa_namespace_cv); + + return (error); - return (0); } static int @@ -5112,8 +5717,8 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, * ambiguous state. */ static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, - nvlist_t **config) +spa_open_common(const char *pool, spa_t **spapp, const void *tag, + nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; @@ -5170,7 +5775,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, */ spa_unload(spa); spa_deactivate(spa); - spa_write_cachefile(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); @@ -5208,7 +5813,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, * If we've recovered the pool, pass back any information we * gathered while doing the load. 
*/ - if (state == SPA_LOAD_RECOVER) { + if (state == SPA_LOAD_RECOVER && config != NULL) { fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } @@ -5229,14 +5834,14 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, } int -spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, - nvlist_t **config) +spa_open_rewind(const char *name, spa_t **spapp, const void *tag, + nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int -spa_open(const char *name, spa_t **spapp, void *tag) +spa_open(const char *name, spa_t **spapp, const void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } @@ -5292,8 +5897,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); if (nspares != 0) { - fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, - nspares); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + (const nvlist_t * const *)spares, nspares); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares)); @@ -5305,13 +5910,15 @@ spa_add_spares(spa_t *spa, nvlist_t *config) for (i = 0; i < nspares; i++) { guid = fnvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID); + VERIFY0(nvlist_lookup_uint64_array(spares[i], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { - VERIFY0(nvlist_lookup_uint64_array(spares[i], - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, - &vsc)); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; + } else { + vs->vs_state = + spa->spa_spares.sav_vdevs[i]->vdev_state; } } } @@ -5340,8 +5947,8 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); if (nl2cache != 0) { - fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, - nl2cache); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + (const nvlist_t * const *)l2cache, nl2cache); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); @@ -5484,7 +6091,7 @@ spa_get_stats(const char *name, nvlist_t **config, fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)); + spa_approx_errlog_size(spa)); if (spa_suspended(spa)) { fnvlist_add_uint64(*config, @@ -5648,8 +6255,8 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, fnvlist_remove(sav->sav_config, config); - fnvlist_add_nvlist_array(sav->sav_config, config, newdevs, - ndevs + oldndevs); + fnvlist_add_nvlist_array(sav->sav_config, config, + (const nvlist_t * const *)newdevs, ndevs + oldndevs); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); @@ -5658,7 +6265,8 @@ spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, * Generate a new dev list. 
*/ sav->sav_config = fnvlist_alloc(); - fnvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs); + fnvlist_add_nvlist_array(sav->sav_config, config, + (const nvlist_t * const *)devs, ndevs); } } @@ -5708,7 +6316,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; - char *altroot = NULL; + const char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; @@ -5721,12 +6329,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; - char *feat_name; - char *poolname; + const char *feat_name; + const char *poolname; nvlist_t *nvl; if (props == NULL || - nvlist_lookup_string(props, "tname", &poolname) != 0) + nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; /* @@ -5869,7 +6478,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, &spares, &nspares) == 0) { spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares); + ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, + nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5881,9 +6491,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { - spa->spa_l2cache.sav_config = fnvlist_alloc(); + VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP)); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); + ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, + nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -5898,6 +6510,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, * Create DDTs (dedup tables). */ ddt_create(spa); + /* + * Create BRT table and BRT table object. 
+ */ + brt_create(spa); spa_update_dspace(spa); @@ -5990,7 +6606,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_spawn_aux_threads(spa); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); /* * Don't count references from objsets that are already closed @@ -6000,6 +6616,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; + spa_import_os(spa); + mutex_exit(&spa_namespace_lock); return (0); @@ -6012,7 +6630,7 @@ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; - char *altroot = NULL; + const char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; @@ -6051,7 +6669,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (props != NULL) spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); @@ -6131,7 +6749,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) else spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares); + ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, + nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -6145,7 +6764,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) else spa->spa_l2cache.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); + ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, + nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); @@ -6181,6 +6801,8 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) zvol_create_minors_recursive(pool); + spa_import_os(spa); + return (0); } @@ -6188,7 +6810,7 @@ nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; - char *poolname, *cachefile; + const char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; @@ -6203,9 +6825,14 @@ spa_tryimport(nvlist_t *tryconfig) /* * Create and initialize the spa structure. */ + char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", + TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); + mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); + kmem_free(name, MAXPATHLEN); /* * Rewind pool if a max txg was provided. @@ -6228,6 +6855,16 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } + /* + * spa_import() relies on a pool config fetched by spa_try_import() + * for spare/cache devices. Import flags are not passed to + * spa_tryimport(), which makes it return early due to a missing log + * device and missing retrieving the cache device and spare eventually. + * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch + * the correct configuration regardless of the missing log device. 
+ */ + spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* @@ -6308,8 +6945,9 @@ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { - int error; + int error = 0; spa_t *spa; + hrtime_t export_start = gethrtime(); if (oldconfig) *oldconfig = NULL; @@ -6331,8 +6969,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa->spa_is_exporting = B_TRUE; /* - * Put a hold on the pool, drop the namespace lock, stop async tasks, - * reacquire the namespace lock, and see if we can export. + * Put a hold on the pool, drop the namespace lock, stop async tasks + * and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -6342,10 +6980,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); + spa->spa_export_thread = curthread; spa_close(spa, FTAG); - if (spa->spa_state == POOL_STATE_UNINITIALIZED) + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + mutex_exit(&spa_namespace_lock); goto export_spa; + } + /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, @@ -6366,7 +7008,16 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, goto fail; } + mutex_exit(&spa_namespace_lock); + /* + * At this point we no longer hold the spa_namespace_lock and + * there were no references on the spa. Future spa_lookups will + * notice the spa->spa_export_thread and wait until we signal + * that we are finshed. + */ + if (spa->spa_sync_on) { + vdev_t *rvd = spa->spa_root_vdev; /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare @@ -6376,6 +7027,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); + mutex_enter(&spa_namespace_lock); goto fail; } @@ -6386,13 +7038,10 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ - if (spa->spa_root_vdev != NULL) { - vdev_t *rvd = spa->spa_root_vdev; - vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); - vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); - vdev_autotrim_stop_all(spa); - vdev_rebuild_stop_all(spa); - } + vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); /* * We want this to be reflected on every label, @@ -6402,14 +7051,34 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; + vdev_config_dirty(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + /* + * If the log space map feature is enabled and the pool is + * getting exported (but not destroyed), we want to spend some + * time flushing as many metaslabs as we can in an attempt to + * destroy log space maps and save import time. This has to be + * done before we set the spa_final_txg, otherwise + * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 
+ * spa_should_flush_logs_on_unload() should be called after + * spa_state has been set to the new_state. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); + + if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; - vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: + spa_export_os(spa); + if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) @@ -6423,9 +7092,16 @@ export_spa: if (oldconfig && spa->spa_config) *oldconfig = fnvlist_dup(spa->spa_config); + if (new_state == POOL_STATE_EXPORTED) + zio_handle_export_delay(spa, gethrtime() - export_start); + + /* + * Take the namespace lock for the actual spa_t removal + */ + mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) - spa_write_cachefile(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); } else { /* @@ -6434,14 +7110,25 @@ export_spa: * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; } + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); fail: spa->spa_is_exporting = B_FALSE; + spa->spa_export_thread = NULL; + spa_async_resume(spa); + /* + * Wake up any waiters in spa_lookup() + */ + cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); } @@ -6501,7 +7188,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) * Add a device to a storage pool. */ int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { uint64_t txg, ndraid = 0; int error; @@ -6592,6 +7279,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } + if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); @@ -6651,9 +7348,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. + * Attach a device to a vdev specified by its guid. The vdev type can be + * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a + * single device). When the vdev is a single device, a mirror vdev will be + * automatically inserted. 
* * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own @@ -6676,7 +7374,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; ASSERT(spa_writeable(spa)); @@ -6696,28 +7394,49 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - if (dsl_scan_resilvering(spa_get_dsl(spa))) + if (dsl_scan_resilvering(spa_get_dsl(spa)) || + dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); + } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (spa->spa_vdev_removal != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_DEVRM_IN_PROGRESS)); + } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; + + if (raidz) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * Can't expand a raidz while prior expand is in progress. + */ + if (spa->spa_raidz_expand != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + } + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; - if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ATTACH)) != 0) + if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) @@ -6732,10 +7451,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, error)); /* - * Spares can't replace logs + * log, dedup and special vdevs should not be replaced by spares. */ - if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || + oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } /* * A dRAID spare can only replace a child of its parent dRAID vdev. @@ -6764,11 +7485,13 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (!replacing) { /* - * For attach, the only allowable parent is a mirror or the root - * vdev. + * For attach, the only allowable parent is a mirror or + * the root vdev. A raidz vdev can be attached to, but + * you cannot attach to a raidz child. */ if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) + pvd->vdev_ops != &vdev_root_ops && + !raidz) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; @@ -6807,7 +7530,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? 
oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -6818,31 +7542,74 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* + * RAIDZ-expansion-specific checks. + */ + if (raidz) { + if (vdev_raidz_attach_check(newvd) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * Fail early if a child is not healthy or being replaced + */ + for (int i = 0; i < oldvd->vdev_children; i++) { + if (vdev_is_dead(oldvd->vdev_child[i]) || + !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, newrootvd, txg, + ENXIO)); + } + /* Also fail if reserved boot area is in-use */ + if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) + != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, + EADDRINUSE)); + } + } + } + + if (raidz) { + /* + * Note: oldvdpath is freed by spa_strfree(), but + * kmem_asprintf() is freed by kmem_strfree(), so we have to + * move it to a spa_strdup-ed string. + */ + char *tmp = kmem_asprintf("raidz%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, - "%s/%s", newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + } ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. @@ -6870,41 +7637,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + if (raidz) { + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(tvd); - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + dtl_max_txg = spa_vdev_config_enter(spa); - /* - * Mark newvd's DTL dirty in this txg. 
- */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_rz_expanding = B_TRUE; - /* - * Schedule the resilver or rebuild to restart in the future. We do - * this to ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - if (rebuild) { - newvd->vdev_rebuild_txg = txg; + vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); + vdev_config_dirty(tvd); - vdev_rebuild(tvd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + dtl_max_txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, tx); + dmu_tx_commit(tx); } else { - newvd->vdev_resilver_txg = txg; + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); + + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + newvd_isspare = newvd->vdev_isspare; - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - vdev_defer_resilver(newvd); + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + /* + * Schedule the resilver or rebuild to restart in the future. + * We do this to ensure that dmu_sync-ed blocks have been + * stitched into the respective datasets. + */ + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } } } @@ -6934,7 +7726,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. + * is a replacing or a spare vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) @@ -7073,7 +7865,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * it may be that the unwritability of the disk is the reason * it's being detached! */ - error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. 
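One behavioral detail in the attach path above is worth spelling out: when the target of spa_vdev_attach() is a raidz vdev (RAIDZ expansion), the new device is sized against a single existing raidz child rather than against the whole top-level vdev, since each child only ever holds one column of the stripe. The following is a minimal stand-alone sketch of that size check, using simplified stand-in types rather than the in-tree vdev_t (a hypothetical illustration, not the actual OpenZFS code):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical, simplified stand-ins for the real vdev structures. */
typedef struct toy_vdev {
	uint64_t asize;			/* usable size of this vdev */
	int nchildren;
	struct toy_vdev *child[8];
	int is_raidz;
} toy_vdev_t;

/*
 * Pick the vdev whose size the new device must meet: for a raidz
 * target we compare against one child (each child stores a single
 * column of the stripe); otherwise we compare against the device
 * being replaced or mirrored itself.
 */
static const toy_vdev_t *
min_size_reference(const toy_vdev_t *oldvd)
{
	return (oldvd->is_raidz ? oldvd->child[0] : oldvd);
}

int
main(void)
{
	toy_vdev_t c0 = { .asize = 1000 }, c1 = { .asize = 1000 };
	toy_vdev_t raidz = { .asize = 2000, .nchildren = 2,
	    .child = { &c0, &c1 }, .is_raidz = 1 };
	uint64_t newvd_asize = 1200;

	/* 1200 >= 1000: a larger single disk can join a raidz of 1000-sized children. */
	printf("attach %s\n",
	    newvd_asize >= min_size_reference(&raidz)->asize ?
	    "allowed" : "rejected (EOVERFLOW)");
	return (0);
}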
@@ -7229,7 +8021,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { + vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && @@ -7241,6 +8033,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_UNINIT && + vd->vdev_initialize_thread != NULL) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(EBUSY)); } switch (cmd_type) { @@ -7253,6 +8049,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; + case POOL_INITIALIZE_UNINIT: + vdev_uninitialize(vd); + break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } @@ -7344,7 +8143,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && - (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { + (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && @@ -7432,7 +8232,7 @@ spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, * Split a set of devices from their mirrors, and create a new pool from them. */ int -spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, +spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; @@ -7441,7 +8241,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; - char *altroot = NULL; + const char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; @@ -7986,6 +8786,7 @@ spa_scan_stop(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); } @@ -8011,6 +8812,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB && + !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) + return (SET_ERROR(ENOTSUP)); + return (dsl_scan(spa->spa_dsl_pool, func)); } @@ -8049,15 +8854,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if (vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8077,7 +8883,7 @@ spa_async_autoexpand(spa_t *spa, vdev_t *vd) spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } -static void +static __attribute__((noreturn)) void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; @@ 
-8145,11 +8951,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. */ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } @@ -8157,7 +8963,8 @@ spa_async_thread(void *arg) * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE || - tasks & SPA_ASYNC_REBUILD_DONE) { + tasks & SPA_ASYNC_REBUILD_DONE || + tasks & SPA_ASYNC_DETACH_SPARE) { spa_vdev_resilver_done(spa); } @@ -8241,6 +9048,10 @@ spa_async_suspend(spa_t *spa) if (condense_thread != NULL) zthr_cancel(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_cancel(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -8267,6 +9078,10 @@ spa_async_resume(spa_t *spa) if (condense_thread != NULL) zthr_resume(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_resume(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -8433,7 +9248,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); - bzero(packed + nvsize, bufsize - nvsize); + memset(packed + nvsize, 0, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); @@ -8472,13 +9287,15 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, nvroot = fnvlist_alloc(); if (sav->sav_count == 0) { - fnvlist_add_nvlist_array(nvroot, config, NULL, 0); + fnvlist_add_nvlist_array(nvroot, config, + (const nvlist_t * const *)NULL, 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); - fnvlist_add_nvlist_array(nvroot, config, list, sav->sav_count); + fnvlist_add_nvlist_array(nvroot, config, + (const nvlist_t * const *)list, sav->sav_count); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); @@ -8499,6 +9316,11 @@ spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; + if (vd->vdev_root_zap != 0 && + spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_root_zap, tx)); + } if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); @@ -8659,27 +9481,14 @@ spa_sync_props(void *arg, dmu_tx_t *tx) while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; - char *strval, *fname; + const char *strval, *fname; zpool_prop_t prop; const char *propname; + const char *elemname = nvpair_name(elem); zprop_type_t proptype; spa_feature_t fid; - switch (prop = zpool_name_to_prop(nvpair_name(elem))) { - case ZPOOL_PROP_INVAL: - /* - * We checked this earlier in spa_prop_validate(). 
- */ - ASSERT(zpool_prop_feature(nvpair_name(elem))); - - fname = strchr(nvpair_name(elem), '@') + 1; - VERIFY0(zfeature_lookup_name(fname, &fid)); - - spa_feature_enable(spa, fid, tx); - spa_history_log_internal(spa, "set", tx, - "%s=enabled", nvpair_name(elem)); - break; - + switch (prop = zpool_name_to_prop(elemname)) { case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* @@ -8722,7 +9531,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); + "%s=%s", elemname, strval); break; case ZPOOL_PROP_COMPATIBILITY: strval = fnvpair_value_string(elem); @@ -8741,6 +9550,20 @@ spa_sync_props(void *arg, dmu_tx_t *tx) "%s=%s", nvpair_name(elem), strval); break; + case ZPOOL_PROP_INVAL: + if (zpool_prop_feature(elemname)) { + fname = strchr(elemname, '@') + 1; + VERIFY0(zfeature_lookup_name(fname, &fid)); + + spa_feature_enable(spa, fid, tx); + spa_history_log_internal(spa, "set", tx, + "%s=enabled", elemname); + break; + } else if (!zfs_prop_user(elemname)) { + ASSERT(zpool_prop_feature(elemname)); + break; + } + zfs_fallthrough; default: /* * Set pool property values in the poolprops mos object. @@ -8753,8 +9576,13 @@ spa_sync_props(void *arg, dmu_tx_t *tx) } /* normalize the property name */ - propname = zpool_prop_to_name(prop); - proptype = zpool_prop_get_type(prop); + if (prop == ZPOOL_PROP_INVAL) { + propname = elemname; + proptype = PROP_TYPE_STRING; + } else { + propname = zpool_prop_to_name(prop); + proptype = zpool_prop_get_type(prop); + } if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); @@ -8763,7 +9591,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); + "%s=%s", elemname, strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); @@ -8776,38 +9604,38 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, - "%s=%lld", nvpair_name(elem), + "%s=%lld", elemname, (longlong_t)intval); - } else { - ASSERT(0); /* not allowed */ - } - switch (prop) { - case ZPOOL_PROP_DELEGATION: - spa->spa_delegation = intval; - break; - case ZPOOL_PROP_BOOTFS: - spa->spa_bootfs = intval; - break; - case ZPOOL_PROP_FAILUREMODE: - spa->spa_failmode = intval; - break; - case ZPOOL_PROP_AUTOTRIM: - spa->spa_autotrim = intval; - spa_async_request(spa, - SPA_ASYNC_AUTOTRIM_RESTART); - break; - case ZPOOL_PROP_AUTOEXPAND: - spa->spa_autoexpand = intval; - if (tx->tx_txg != TXG_INITIAL) + switch (prop) { + case ZPOOL_PROP_DELEGATION: + spa->spa_delegation = intval; + break; + case ZPOOL_PROP_BOOTFS: + spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + case ZPOOL_PROP_AUTOTRIM: + spa->spa_autotrim = intval; spa_async_request(spa, - SPA_ASYNC_AUTOEXPAND); - break; - case ZPOOL_PROP_MULTIHOST: - spa->spa_multihost = intval; - break; - default: - break; + SPA_ASYNC_AUTOTRIM_RESTART); + break; + case ZPOOL_PROP_AUTOEXPAND: + spa->spa_autoexpand = intval; + if (tx->tx_txg != TXG_INITIAL) + spa_async_request(spa, + SPA_ASYNC_AUTOEXPAND); + break; + case ZPOOL_PROP_MULTIHOST: + spa->spa_multihost = intval; + break; + default: + break; + } + } else { + ASSERT(0); /* not allowed */ } } @@ -9045,8 +9873,10 @@ 
spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) &spa->spa_deferred_bpobj, tx); } + brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); + dsl_errorscrub_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); @@ -9057,6 +9887,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) != NULL) vdev_sync(vd, txg); + if (pass == 1) { + /* + * dsl_pool_sync() -> dp_sync_tasks may have dirtied + * the config. If that happens, this txg should not + * be a no-op. So we must sync the config to the MOS + * before checking for no-op. + * + * Note that when the config is dirty, it will + * be written to the MOS (i.e. the MOS will be + * dirtied) every time we call spa_sync_config_object() + * in this txg. Therefore we can't call this after + * dsl_pool_sync() every pass, because it would + * prevent us from converging, since we'd dirty + * the MOS every pass. + * + * Sync tasks can only be processed in pass 1, so + * there's no need to do this in later passes. + */ + spa_sync_config_object(spa, tx); + } + /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock @@ -9067,7 +9918,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) * don't want to rely on that here). */ if (pass == 1 && - spa->spa_uberblock.ub_rootbp.blk_birth < txg && + BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this @@ -9170,6 +10021,13 @@ spa_sync(spa_t *spa, uint64_t txg) ZIO_FLAG_CANFAIL); /* + * Now that there can be no more cloning in this transaction group, + * but we are still before issuing frees, we can process pending BRT + * updates. + */ + brt_pending_apply(spa, txg); + + /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -9188,7 +10046,13 @@ spa_sync(spa_t *spa, uint64_t txg) * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while (list_head(&spa->spa_state_dirty_list) != NULL) { + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + /* Avoid holding the write lock unless actually necessary */ + if (vd->vdev_aux == NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + continue; + } /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. @@ -9304,11 +10168,17 @@ spa_sync(spa_t *spa, uint64_t txg) metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); + /* spa_embedded_log_class has only one metaslab per vdev. */ + metaslab_class_evict_old(spa->spa_special_class, txg); + metaslab_class_evict_old(spa->spa_dedup_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); + if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) + vdev_autotrim_kick(spa); + /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). @@ -9362,6 +10232,132 @@ spa_sync_allpools(void) mutex_exit(&spa_namespace_lock); } +taskq_t * +spa_sync_tq_create(spa_t *spa, const char *name) +{ + kthread_t **kthreads; + + ASSERT(spa->spa_sync_tq == NULL); + ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); + + /* + * - do not allow more allocators than cpus. + * - there may be more cpus than allocators. + * - do not allow more sync taskq threads than allocators or cpus. 
+ */ + int nthreads = spa->spa_alloc_count; + spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * + nthreads, KM_SLEEP); + + spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, + nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); + VERIFY(spa->spa_sync_tq != NULL); + VERIFY(kthreads != NULL); + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < nthreads; i++, ti++) { + ti->sti_thread = kthreads[i]; + ti->sti_allocator = i; + } + + kmem_free(kthreads, sizeof (*kthreads) * nthreads); + return (spa->spa_sync_tq); +} + +void +spa_sync_tq_destroy(spa_t *spa) +{ + ASSERT(spa->spa_sync_tq != NULL); + + taskq_wait(spa->spa_sync_tq); + taskq_destroy(spa->spa_sync_tq); + kmem_free(spa->spa_syncthreads, + sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); + spa->spa_sync_tq = NULL; +} + +uint_t +spa_acq_allocator(spa_t *spa) +{ + int i; + + if (spa->spa_alloc_count == 1) + return (0); + + mutex_enter(&spa->spa_allocs_use->sau_lock); + uint_t r = spa->spa_allocs_use->sau_rotor; + do { + if (++r == spa->spa_alloc_count) + r = 0; + } while (spa->spa_allocs_use->sau_inuse[r]); + spa->spa_allocs_use->sau_inuse[r] = B_TRUE; + spa->spa_allocs_use->sau_rotor = r; + mutex_exit(&spa->spa_allocs_use->sau_lock); + + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + ti->sti_allocator = r; + break; + } + } + ASSERT3S(i, <, spa->spa_alloc_count); + return (r); +} + +void +spa_rel_allocator(spa_t *spa, uint_t allocator) +{ + if (spa->spa_alloc_count > 1) + spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; +} + +void +spa_select_allocator(zio_t *zio) +{ + zbookmark_phys_t *bm = &zio->io_bookmark; + spa_t *spa = zio->io_spa; + + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * A gang block (for example) may have inherited its parent's + * allocator, in which case there is nothing further to do here. + */ + if (ZIO_HAS_ALLOCATOR(zio)) + return; + + ASSERT(spa != NULL); + ASSERT(bm != NULL); + + /* + * First try to use an allocator assigned to the syncthread, and set + * the corresponding write issue taskq for the allocator. + * Note, we must have an open pool to do this. + */ + if (spa->spa_sync_tq != NULL) { + spa_syncthread_info_t *ti = spa->spa_syncthreads; + for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { + if (ti->sti_thread == curthread) { + zio->io_allocator = ti->sti_allocator; + return; + } + } + } + + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. 
+ */ + uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, + bm->zb_blkid >> 20); + + zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; +} + /* * ========================================================================== * Miscellaneous routines @@ -9454,6 +10450,7 @@ spa_upgrade(spa_t *spa, uint64_t version) static boolean_t spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) { + (void) spa; int i; uint64_t vdev_guid; @@ -9699,9 +10696,10 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: - if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + *in_progress = vdev_rebuild_active(spa->spa_root_vdev); + if (*in_progress) break; - fallthrough; + zfs_fallthrough; case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; @@ -9714,6 +10712,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } + case ZPOOL_WAIT_RAIDZ_EXPAND: + { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } @@ -9817,6 +10821,8 @@ spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } +#else + (void) spa, (void) vd, (void) hist_nvl, (void) name; #endif return (ev); } @@ -9829,6 +10835,8 @@ spa_event_post(sysevent_t *ev) zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } +#else + (void) ev; #endif } @@ -9900,10 +10908,14 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, + "Percentage of CPUs to run a metaslab preload taskq"); + /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); +/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); @@ -9914,29 +10926,47 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, "Percentage of CPUs to run an IO worker thread"); -ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); -ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); +/* END CSTYLED */ -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, - "Set the livelist condense zthr to pause"); +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, + ZMOD_RW, "Set the livelist condense zthr to pause"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, - "Set the livelist condense synctask to pause"); 
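The spa_select_allocator() change above spreads write issue across allocators while keeping logically adjacent I/Os together by hashing the tuple (objset, object, level, blkid >> 20), so all blocks in the same 2^20-block region of an object map to the same allocator. Below is a rough stand-alone illustration of that chunk-then-hash idea; it uses a generic 64-bit mixer in place of cityhash4() (which is internal to ZFS), so the exact mapping is assumed, not the in-tree one:

#include <stdint.h>
#include <stdio.h>

/* Generic 64-bit mixer standing in for cityhash4(); not the ZFS hash. */
static uint64_t
mix64(uint64_t x)
{
	x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
	x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
	x ^= x >> 33;
	return (x);
}

static unsigned
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid, unsigned alloc_count)
{
	/*
	 * Blocks within the same 2^20-block region of an object hash to
	 * the same allocator, so sequential writes stay physically close;
	 * different regions and objects spread across all allocators.
	 */
	uint64_t region = blkid >> 20;
	uint64_t hv = mix64(objset ^ mix64(object) ^ mix64(level) ^
	    mix64(region));
	return ((unsigned)(hv % alloc_count));
}

int
main(void)
{
	/* Two adjacent blocks of the same object land on the same allocator... */
	printf("%u %u\n", pick_allocator(1, 42, 0, 5, 4),
	    pick_allocator(1, 42, 0, 6, 4));
	/* ...while a block in a distant region may land elsewhere. */
	printf("%u\n", pick_allocator(1, 42, 0, 5ULL << 21, 4));
	return (0);
}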
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, + ZMOD_RW, "Set the livelist condense synctask to pause"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, + INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, + INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); -ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, + ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, + "Number of CPUs per write issue taskq"); diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c index 09f62996853d..1efff47f87a0 100644 --- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c +++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -158,7 +158,7 @@ * amount of checkpointed data that has been freed within them while * the pool had a checkpoint. 
*/ -unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024; +static uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; int spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) @@ -166,7 +166,7 @@ spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); - bzero(pcs, sizeof (pool_checkpoint_stat_t)); + memset(pcs, 0, sizeof (pool_checkpoint_stat_t)); int error = zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); @@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) if (error != 0) { zfs_panic_recover("zfs: error %lld was returned " "while incrementally destroying the checkpoint " - "space map of vdev %u\n", + "space map of vdev %llu\n", (longlong_t)error, vd->vdev_id); } ASSERT0(words_after); @@ -380,10 +380,10 @@ spa_checkpoint_discard_is_done(spa_t *spa) return (B_TRUE); } -/* ARGSUSED */ boolean_t spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) { + (void) zthr; spa_t *spa = arg; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) @@ -450,10 +450,10 @@ spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) } -/* ARGSUSED */ static int spa_checkpoint_check(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) @@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + if (spa->spa_raidz_expand != NULL) + return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); @@ -474,10 +477,10 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void spa_checkpoint_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; uberblock_t checkpoint = spa->spa_ubsync; @@ -571,10 +574,10 @@ spa_checkpoint(const char *pool) return (error); } -/* ARGSUSED */ static int spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) @@ -589,10 +592,10 @@ spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) return (0); } -/* ARGSUSED */ static void spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, @@ -631,7 +634,7 @@ EXPORT_SYMBOL(spa_checkpoint_discard_thread); EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW, "Limit for memory used in prefetching the checkpoint space map done " "on each vdev while discarding the checkpoint"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index ad82932ce567..a77874ea0dd3 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -67,8 +67,10 @@ static uint64_t spa_config_generation = 1; * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ -char *spa_config_path = ZPOOL_CACHE; -int zfs_autoimport_disable = 1; +char *spa_config_path = (char *)ZPOOL_CACHE; +#ifdef _KERNEL +static int zfs_autoimport_disable = B_TRUE; +#endif /* * Called when the module is first loaded, this routine loads the configuration @@ -238,11 +240,12 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) * would be required. */ void -spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) +spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, + boolean_t postblkidevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; - char *pool_name; + const char *pool_name; boolean_t ccw_failure; int error = 0; @@ -344,6 +347,18 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); + + /* + * Post udev event to sync blkid information if the pool is created + * or a new vdev is added to the pool. + */ + if ((target->spa_root_vdev) && postblkidevent) { + vdev_post_kobj_evt(target->spa_root_vdev); + for (int i = 0; i < target->spa_l2cache.sav_count; i++) + vdev_post_kobj_evt(target->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < target->spa_spares.sav_count; i++) + vdev_post_kobj_evt(target->spa_spares.sav_vdevs[i]); + } } /* @@ -352,23 +367,24 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. 
*/ -nvlist_t * -spa_all_configs(uint64_t *generation) +int +spa_all_configs(uint64_t *generation, nvlist_t **pools) { - nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) - return (NULL); + return (SET_ERROR(EEXIST)); - pools = fnvlist_alloc(); + int error = mutex_enter_interruptible(&spa_namespace_lock); + if (error) + return (SET_ERROR(EINTR)); - mutex_enter(&spa_namespace_lock); + *pools = fnvlist_alloc(); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); - fnvlist_add_nvlist(pools, spa_name(spa), + fnvlist_add_nvlist(*pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } @@ -376,7 +392,7 @@ spa_all_configs(uint64_t *generation) *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); - return (pools); + return (0); } void @@ -403,7 +419,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; - char *pool_name; + const char *pool_name; if (vd == NULL) { vd = rvd; @@ -598,6 +614,7 @@ spa_config_update(spa_t *spa, int what) */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, + what != SPA_CONFIG_UPDATE_POOL, what != SPA_CONFIG_UPDATE_POOL); } @@ -611,7 +628,6 @@ EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); -/* BEGIN CSTYLED */ #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, @@ -620,4 +636,3 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, "Disable pool import at module load"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index fa5120eb61b3..62d7b4fa2df2 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,7 +20,9 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014, Delphix. All rights reserved. + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2021, 2022, George Amanakis. All rights reserved. */ /* @@ -43,6 +45,16 @@ * calculation when the data is requested, storing the result so future queries * will be faster. * + * If the head_errlog feature is enabled, a different on-disk format is used. + * The error log of each head dataset is stored separately in the zap object + * and keyed by the head id. This enables listing every dataset affected in + * userland. In order to be able to track whether an error block has been + * modified or added to snapshots since it was marked as an error, a new tuple + * is introduced: zbookmark_err_phys_t. It allows the storage of the birth + * transaction group of an error block on-disk. The birth transaction group is + * used by check_filesystem() to assess whether this block was freed, + * re-written or added to a snapshot since its marking as an error. 
+ * * This log is then shipped into an nvlist where the key is the dataset name and * the value is the object name. Userland is then responsible for uniquifying * this list and displaying it to the user. @@ -53,7 +65,25 @@ #include <sys/spa_impl.h> #include <sys/zap.h> #include <sys/zio.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_objset.h> +#include <sys/dbuf.h> +#include <sys/zfs_znode.h> + +#define NAME_MAX_LEN 64 + +typedef struct clones { + uint64_t clone_ds; + list_node_t node; +} clones_t; +/* + * spa_upgrade_errlog_limit : A zfs module parameter that controls the number + * of on-disk error log entries that will be converted to the new + * format when enabling head_errlog. Defaults to 0 which converts + * all log entries. + */ +static uint_t spa_upgrade_errlog_limit = 0; /* * Convert a bookmark to a string. @@ -67,9 +97,35 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) } /* - * Convert a string to a bookmark + * Convert an err_phys to a string. + */ +static void +errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, + (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); +} + +/* + * Convert a string to a err_phys. + */ +void +name_to_errphys(char *buf, zbookmark_err_phys_t *zep) +{ + zep->zb_object = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_birth = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +/* + * Convert a string to a bookmark. */ -#ifdef _KERNEL static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { @@ -82,7 +138,41 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } -#endif + +void +zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) +{ + zb->zb_objset = dataset; + zb->zb_object = zep->zb_object; + zb->zb_level = zep->zb_level; + zb->zb_blkid = zep->zb_blkid; +} + +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +/* + * Retrieve the head filesystem. + */ +static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) +{ + dsl_dataset_t *ds; + int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, + dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + + if (error != 0) + return (error); + + ASSERT(head_ds); + *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + + return (error); +} /* * Log an uncorrectable error to the persistent error log. We add it to the @@ -90,7 +180,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) * during spa_errlog_sync(). */ void -spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth) { spa_error_entry_t search; spa_error_entry_t *new; @@ -123,96 +213,856 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *zb; - avl_insert(tree, new, where); + /* + * If the head_errlog feature is enabled, store the birth txg now. In + * case the file is deleted before spa_errlog_sync() runs, we will not + * be able to retrieve the birth txg. 
+ */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + new->se_zep.zb_object = zb->zb_object; + new->se_zep.zb_level = zb->zb_level; + new->se_zep.zb_blkid = zb->zb_blkid; + new->se_zep.zb_birth = birth; + } + + avl_insert(tree, new, where); mutex_exit(&spa->spa_errlist_lock); } +int +find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg) +{ + objset_t *os; + int error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + dnode_t *dn; + blkptr_t bp; + + error = dnode_hold(os, zep->zb_object, FTAG, &dn); + if (error != 0) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + if (error == 0 && BP_IS_HOLE(&bp)) + error = SET_ERROR(ENOENT); + + *birth_txg = BP_GET_LOGICAL_BIRTH(&bp); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); +} + +/* + * This function finds the oldest affected filesystem containing an error + * block. + */ +int +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *top_affected_fs) +{ + uint64_t oldest_dsobj; + int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, + &oldest_dsobj); + if (error != 0) + return (error); + + dsl_dataset_t *ds; + error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + *top_affected_fs = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (0); +} + + +#ifdef _KERNEL +/* + * Copy the bookmark to the end of the user-space buffer which starts at + * uaddr and has *count unused entries, and decrement *count by 1. + */ +static int +copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) +{ + if (*count == 0) + return (SET_ERROR(ENOMEM)); + + *count -= 1; + if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); + return (0); +} + /* - * Return the number of errors currently in the error log. This is actually the - * sum of both the last log and the current log, since we don't know the union - * of these logs until we reach userland. + * Each time the error block is referenced by a snapshot or clone, add a + * zbookmark_phys_t entry to the userspace array at uaddr. The array is + * filled from the back and the in-out parameter *count is modified to be the + * number of unused entries at the beginning of the array. The function + * scrub_filesystem() is modelled after this one. */ +static int +check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + void *uaddr, uint64_t *count, list_t *clones_list) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj_flags(dp, head_ds, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + error = find_birth_txg(ds, zep, &latest_txg); + + /* + * If find_birth_txg() errors out otherwise, let txg_to_consider be + * equal to the spa's syncing txg: if check_filesystem() errors out + * then affected snapshots or clones will not be checked. + */ + if (error == 0 && zep->zb_birth == latest_txg) { + /* Block neither free nor rewritten. 
*/ + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (error); + } + check_snapshot = B_FALSE; + } else if (error == 0) { + txg_to_consider = latest_txg; + } + + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (error); + } + } + + if (snap_count == 0) { + /* Filesystem without snapshots. */ + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (0); + } + + uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t), + KM_SLEEP); + + int aff_snap_count = 0; + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; + + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + error = dsl_dataset_hold_obj_flags(dp, snap_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + goto out; + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + continue; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + affected = (error == 0 && zep->zb_birth == blk_txg); + } + + /* Report errors in snapshots. */ + if (affected) { + snap_obj_array[aff_snap_count] = snap_obj; + aff_snap_count++; + + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); + goto out; + } + } + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + } + + if (zap_clone == 0 || aff_snap_count == 0) { + error = 0; + goto out; + } + + /* Check clones. */ + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + + dsl_dataset_t *clone; + error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer, + DS_HOLD_FLAG_DECRYPT, FTAG, &clone); + + if (error != 0) + break; + + /* + * Only clones whose origins were affected could also + * have affected snapshots. 
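The snapshot loop in check_filesystem() walks the ds_prev_snap_obj chain newest-to-oldest and stops as soon as a snapshot's creation txg falls outside the window (block birth txg, txg_to_consider]; only snapshots created after the block was born can still reference it. A simplified model of that window test over a prev-pointer chain (the snap struct here is invented for illustration):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Toy model: a snapshot knows its creation txg and its predecessor. */
struct snap {
	uint64_t txg;
	const struct snap *prev;
};

/* Count snapshots that can still reference a block born at birth_txg. */
static unsigned
affected_snapshots(const struct snap *newest, uint64_t birth_txg,
    uint64_t txg_to_consider)
{
	unsigned n = 0;

	for (const struct snap *s = newest; s != NULL &&
	    birth_txg < s->txg && s->txg <= txg_to_consider; s = s->prev)
		n++;
	return (n);
}

int
main(void)
{
	struct snap s1 = { 100, NULL };
	struct snap s2 = { 200, &s1 };
	struct snap s3 = { 300, &s2 };

	/* A block born in txg 150 is visible to the 200 and 300 snapshots. */
	printf("%u snapshots affected\n", affected_snapshots(&s3, 150, 400));
	return (0);
}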
+ */ + boolean_t found = B_FALSE; + for (int i = 0; i < snap_count; i++) { + if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj + == snap_obj_array[i]) + found = B_TRUE; + } + dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG); + + if (!found) + continue; + + clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP); + ct->clone_ds = za->za_first_integer; + list_insert_tail(clones_list, ct); + } + + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + +out: + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (error); +} + +static int +process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + void *uaddr, uint64_t *count) +{ + /* + * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the + * birth txg or the head filesystem of the block pointer. This may + * happen e.g. when an encrypted filesystem is not mounted or when + * the key is not loaded. In this case do not proceed to + * check_filesystem(), instead do the accounting here. + */ + if (zep->zb_birth == 0 || head_ds == 0) { + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + int error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + return (error); + } + return (0); + } + + uint64_t top_affected_fs; + uint64_t init_count = *count; + int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); + if (error == 0) { + clones_t *ct; + list_t clones_list; + + list_create(&clones_list, sizeof (clones_t), + offsetof(clones_t, node)); + + error = check_filesystem(spa, top_affected_fs, zep, + uaddr, count, &clones_list); + + while ((ct = list_remove_head(&clones_list)) != NULL) { + error = check_filesystem(spa, ct->clone_ds, zep, + uaddr, count, &clones_list); + kmem_free(ct, sizeof (*ct)); + + if (error) { + while (!list_is_empty(&clones_list)) { + ct = list_remove_head(&clones_list); + kmem_free(ct, sizeof (*ct)); + } + break; + } + } + + list_destroy(&clones_list); + } + if (error == 0 && init_count == *count) { + /* + * If we reach this point, no errors have been detected + * in the checked filesystems/snapshots. Before returning mark + * the error block to be removed from the error lists and logs. + */ + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + spa_remove_error(spa, &zb, zep->zb_birth); + } + + return (error); +} +#endif + +/* Return the number of errors in the error log */ uint64_t -spa_get_errlog_size(spa_t *spa) +spa_get_last_errlog_size(spa_t *spa) { uint64_t total = 0, count; - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + if (spa->spa_errlog_last != 0 && zap_count(spa->spa_meta_objset, spa->spa_errlog_last, &count) == 0) total += count; mutex_exit(&spa->spa_errlog_lock); + return (total); +} + +/* + * If a healed bookmark matches an entry in the error log we stash it in a tree + * so that we can later remove the related log entries in sync context. 
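Note how process_error_block() descends into clones without recursion: check_filesystem() appends candidate clone datasets to clones_list, and the caller keeps popping the head of the list (each pop may queue further clones) until it drains. A generic sketch of that worklist pattern, with an invented visit() standing in for check_filesystem():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct work {
	uint64_t ds_obj;
	struct work *next;
};

static void
push(struct work **head, uint64_t ds_obj)
{
	struct work *w = malloc(sizeof (*w));

	w->ds_obj = ds_obj;
	w->next = *head;
	*head = w;
}

/*
 * Visit one dataset; in the real code this is check_filesystem(), which
 * may queue that dataset's own clones.  Here we just stop at object 0.
 */
static void
visit(uint64_t ds_obj, struct work **head)
{
	printf("checking dataset %llu\n", (unsigned long long)ds_obj);
	if (ds_obj > 0)
		push(head, ds_obj - 1);	/* pretend we found one more clone */
}

int
main(void)
{
	struct work *head = NULL;

	push(&head, 3);			/* top affected filesystem */
	while (head != NULL) {
		struct work *w = head;
		head = w->next;
		visit(w->ds_obj, &head);
		free(w);
	}
	return (0);
}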
+ */ +static void +spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, + const uint64_t birth) +{ + char name[NAME_MAX_LEN]; + + if (obj == 0) + return; + + boolean_t held_list = B_FALSE; + boolean_t held_log = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(healed_zb, name, sizeof (name)); + + if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + /* + * Found an error matching healed zb, add zb to our + * tree of healed errors + */ + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + return; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + avl_insert(tree, new, where); + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + } + return; + } + + zbookmark_err_phys_t healed_zep; + healed_zep.zb_object = healed_zb->zb_object; + healed_zep.zb_level = healed_zb->zb_level; + healed_zep.zb_blkid = healed_zb->zb_blkid; + healed_zep.zb_birth = birth; + + errphys_to_name(&healed_zep, name, sizeof (name)); + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (zap_contains(spa->spa_meta_objset, za.za_first_integer, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + continue; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + new->se_zep = healed_zep; + avl_insert(tree, new, where); + + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + } + } + zap_cursor_fini(&zc); +} + +/* + * If this error exists in the given tree remove it. 
+ */ +static void +remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb) +{ + spa_error_entry_t search, *found; + avl_index_t where; + + mutex_enter(&spa->spa_errlist_lock); + search.se_bookmark = *zb; + if ((found = avl_find(t, &search, &where)) != NULL) { + avl_remove(t, found); + kmem_free(found, sizeof (spa_error_entry_t)); + } + mutex_exit(&spa->spa_errlist_lock); +} + + +/* + * Removes all of the recv healed errors from both on-disk error logs + */ +static void +spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) +{ + char name[NAME_MAX_LEN]; + spa_error_entry_t *se; + void *cookie = NULL; + + ASSERT(MUTEX_HELD(&spa->spa_errlog_lock)); + + while ((se = avl_destroy_nodes(&spa->spa_errlist_healed, + &cookie)) != NULL) { + remove_error_from_list(spa, s, &se->se_bookmark); + remove_error_from_list(spa, l, &se->se_bookmark); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(&se->se_bookmark, name, sizeof (name)); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_last, name, tx); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_scrub, name, tx); + } else { + errphys_to_name(&se->se_zep, name, sizeof (name)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_scrub); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + } + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Stash away healed bookmarks to remove them from the on-disk error logs + * later in spa_remove_healed_errors(). + */ +void +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth) +{ + spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); + spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); +} + +static uint64_t +approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj) +{ + if (spa_err_obj == 0) + return (0); + uint64_t total = 0; + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t count; + if (zap_count(spa->spa_meta_objset, za.za_first_integer, + &count) == 0) + total += count; + } + zap_cursor_fini(&zc); + return (total); +} + +/* + * Return the approximate number of errors currently in the error log. This + * will be nonzero if there are some errors, but otherwise it may be more + * or less than the number of entries returned by spa_get_errlog(). 
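With head_errlog enabled the on-disk log becomes two levels deep: an outer ZAP keyed by head-dataset object whose values name per-dataset ZAPs of error strings, so approx_errlog_size_impl() sums zap_count() over every inner object. The result may differ from what spa_get_errlog() returns, since that path expands entries per affected snapshot or clone and, as a later comment notes, a block shared by two datasets is counted twice. A toy model of the two-level count, with a flat array standing in for the ZAP objects:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Toy stand-in for one per-dataset error ZAP. */
struct dataset_errlog {
	uint64_t head_ds;	/* outer key: head dataset object */
	uint64_t nerrors;	/* zap_count() of the inner object */
};

static uint64_t
approx_size(const struct dataset_errlog *logs, size_t n)
{
	uint64_t total = 0;

	for (size_t i = 0; i < n; i++)
		total += logs[i].nerrors;
	return (total);
}

int
main(void)
{
	struct dataset_errlog logs[] = {
		{ 54, 3 },	/* 3 error blocks logged under dataset 54 */
		{ 61, 1 },
		{ 75, 7 },
	};

	printf("approx errlog size: %llu\n", (unsigned long long)
	    approx_size(logs, sizeof (logs) / sizeof (logs[0])));
	return (0);
}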
+ */ +uint64_t +spa_approx_errlog_size(spa_t *spa) +{ + uint64_t total = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + mutex_enter(&spa->spa_errlog_lock); + uint64_t count; + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + } else { + mutex_enter(&spa->spa_errlog_lock); + total += approx_errlog_size_impl(spa, spa->spa_errlog_last); + total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub); + mutex_exit(&spa->spa_errlog_lock); + } mutex_enter(&spa->spa_errlist_lock); total += avl_numnodes(&spa->spa_errlist_last); total += avl_numnodes(&spa->spa_errlist_scrub); mutex_exit(&spa->spa_errlist_lock); - return (total); } -#ifdef _KERNEL -static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +/* + * This function sweeps through an on-disk error log and stores all bookmarks + * as error bookmarks in a new ZAP object. At the end we discard the old one, + * and spa_update_errlog() will set the spa's on-disk error log to new ZAP + * object. + */ +static void +sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, + dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; zbookmark_phys_t zb; + uint64_t count; - if (obj == 0) - return (0); + *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); + + /* + * If we cannnot perform the upgrade we should clear the old on-disk + * error logs. + */ + if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); + return; + } - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (spa_upgrade_errlog_limit != 0 && + zc.zc_cd == spa_upgrade_errlog_limit) + break; - if (*count == 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); + name_to_bookmark(za.za_name, &zb); + + zbookmark_err_phys_t zep; + zep.zb_object = zb.zb_object; + zep.zb_level = zb.zb_level; + zep.zb_blkid = zb.zb_blkid; + zep.zb_birth = 0; + + /* + * In case of an error we should simply continue instead of + * returning prematurely. See the next comment. + */ + uint64_t head_ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + continue; + + head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + /* + * The objset and the dnode are required for getting the block + * pointer, which is used to determine if BP_IS_HOLE(). If + * getting the objset or the dnode fails, do not create a + * zap entry (presuming we know the dataset) as this may create + * spurious errors that we cannot ever resolve. If an error is + * truly persistent, it should re-appear after a scan. 
+ */ + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + continue; } - name_to_bookmark(za.za_name, &zb); + dnode_t *dn; + blkptr_t bp; - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); + if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + continue; } - *count -= 1; - } + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp, + NULL, NULL); + if (error == EACCES) + error = 0; + else if (!error) + zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp); + + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + + if (error != 0 || BP_IS_HOLE(&bp)) + continue; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, + head_ds, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *newobj, head_ds, err_obj, tx); + } + char buf[64]; + errphys_to_name(&zep, buf, sizeof (buf)); + + const char *name = ""; + (void) zap_update(spa->spa_meta_objset, err_obj, + buf, 1, strlen(name) + 1, name, tx); + } zap_cursor_fini(&zc); - return (0); + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); +} + +void +spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t newobj = 0; + + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_last != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); + spa->spa_errlog_last = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, + sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); + } + + if (spa->spa_errlog_scrub != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); + spa->spa_errlog_scrub = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, + sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); + } + + mutex_exit(&spa->spa_errlog_lock); } +#ifdef _KERNEL +/* + * If an error block is shared by two datasets it will be counted twice. 
+ */ static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) +process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { - spa_error_entry_t *se; + if (obj == 0) + return (0); - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + if (*count == 0) { + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); + return (SET_ERROR(ENOMEM)); + } + + zbookmark_phys_t zb; + name_to_bookmark(za->za_name, &zb); + + int error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); + return (error); + } + } + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); + return (0); + } - if (*count == 0) - return (SET_ERROR(ENOMEM)); + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; + + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; + uint64_t head_ds; + name_to_object(za->za_name, &head_ds); + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { + + zbookmark_err_phys_t head_ds_block; + name_to_errphys(head_ds_attr->za_name, &head_ds_block); + int error = process_error_block(spa, head_ds, + &head_ds_block, uaddr, count); + + if (error != 0) { + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, + sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + return (error); + } + } + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + } + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + return (0); +} - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); +static int +process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) +{ + spa_error_entry_t *se; - *count -= 1; + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (se = avl_first(list); se != NULL; + se = AVL_NEXT(list, se)) { + int error = + copyout_entry(&se->se_bookmark, uaddr, count); + if (error != 0) { + return (error); + } + } + return (0); } + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); + + /* + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. 
+ */ + if (error != 0) + head_ds = se->se_bookmark.zb_objset; + + error = process_error_block(spa, head_ds, + &se->se_zep, uaddr, count); + if (error != 0) + return (error); + } return (0); } #endif @@ -229,11 +1079,18 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) * the error list lock when we are finished. */ int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; #ifdef _KERNEL + /* + * The pool config lock is needed to hold a dataset_t via (among other + * places) process_error_list() -> process_error_block()-> + * find_top_affected_fs(), and lock ordering requires that we get it + * before the spa_errlog_lock. + */ + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); @@ -244,14 +1101,17 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) mutex_enter(&spa->spa_errlist_lock); if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlog_lock); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); +#else + (void) spa, (void) uaddr, (void) count; #endif return (ret); @@ -297,35 +1157,89 @@ spa_errlog_drain(spa_t *spa) /* * Process a list of errors into the current on-disk log. */ -static void +void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) { spa_error_entry_t *se; - char buf[64]; + char buf[NAME_MAX_LEN]; void *cookie; - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); + if (avl_numnodes(t) == 0) + return; + + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - /* add errors to the current log */ + /* add errors to the current log */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { - char *name = se->se_name ? se->se_name : ""; - bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + const char *name = se->se_name ? se->se_name : ""; + (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, + strlen(name) + 1, name, tx); + } + } else { + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_zep.zb_object; + zep.zb_level = se->se_zep.zb_level; + zep.zb_blkid = se->se_zep.zb_blkid; + zep.zb_birth = se->se_zep.zb_birth; + + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); + + /* + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. + */ + if (error != 0) + head_ds = se->se_bookmark.zb_objset; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, + *obj, head_ds, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *obj, head_ds, err_obj, tx); + } + errphys_to_name(&zep, buf, sizeof (buf)); + + const char *name = se->se_name ? 
se->se_name : ""; (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); + err_obj, buf, 1, strlen(name) + 1, name, tx); } + } + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); +} - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); +static void +delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); } + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } /* @@ -352,6 +1266,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && avl_numnodes(&spa->spa_errlist_last) == 0 && + avl_numnodes(&spa->spa_errlist_healed) == 0 && !spa->spa_scrub_finished) { mutex_exit(&spa->spa_errlist_lock); return; @@ -362,11 +1277,23 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) spa->spa_scrub_finished = B_FALSE; mutex_exit(&spa->spa_errlist_lock); + + /* + * The pool config lock is needed to hold a dataset_t via + * sync_error_list() -> get_head_ds(), and lock ordering + * requires that we get it before the spa_errlog_lock. + */ + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); /* + * Remove healed errors from errors. + */ + spa_remove_healed_errors(spa, &last, &scrub, tx); + + /* * Sync out the current list of errors. 
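spa_get_errlog() and spa_errlog_sync() both take the pool config lock before spa_errlog_lock; sticking to one fixed acquisition order everywhere (and releasing in reverse) is what rules out an ABBA deadlock between the two paths. A minimal pthread illustration of that rule, with made-up lock names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t errlog_lock = PTHREAD_MUTEX_INITIALIZER;

/* Every path that needs both locks acquires them in this order. */
static void
with_both_locks(const char *who)
{
	pthread_mutex_lock(&config_lock);	/* outer lock first */
	pthread_mutex_lock(&errlog_lock);	/* inner lock second */

	printf("%s: holding config_lock then errlog_lock\n", who);

	pthread_mutex_unlock(&errlog_lock);	/* release in reverse order */
	pthread_mutex_unlock(&config_lock);
}

static void *
worker(void *arg)
{
	(void) arg;
	with_both_locks("reader");
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	with_both_locks("syncer");
	pthread_join(t, NULL);
	return (0);
}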
*/ sync_error_list(spa, &last, &spa->spa_errlog_last, tx); @@ -376,8 +1303,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (scrub_finished) { if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); + delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; @@ -402,15 +1328,163 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) dmu_tx_commit(tx); mutex_exit(&spa->spa_errlog_lock); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); +} + +static void +delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, + dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + if (head_ds == ds) { + (void) zap_remove(spa->spa_meta_objset, spa_err_obj, + za.za_name, tx); + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + break; + } + } + zap_cursor_fini(&zc); +} + +void +spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); + delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + +static int +find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, + uint64_t *txg) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj_flags(dp, old_head, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 && + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) + break; + + if (error != 0) + return (error); + + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + ASSERT(prev_obj != 0); + *txg = prev_obj_txg; + return (0); +} + +static void +swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t + old_head, dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + uint64_t old_head_errlog; + int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, + old_head, &old_head_errlog); + + /* If no error log, then there is nothing to do. */ + if (error != 0) + return; + + uint64_t txg; + error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); + if (error != 0) + return; + + /* + * Create an error log if the file system being promoted does not + * already have one. 
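swap_errlog() handles promotion by walking the old head's per-dataset error ZAP and moving every entry whose birth txg predates the txg at which the two heads diverged into the promoted head's log. A toy model of that birth-txg filter over an in-memory list (the entry type and the moved flag are illustrative stand-ins for the zap_update/zap_remove pair):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct err_entry {
	uint64_t blkid;
	uint64_t birth;		/* txg the block was written */
	int moved;		/* toy stand-in for zap_update + zap_remove */
};

/* Move entries born before the divergence txg to the new head's log. */
static void
swap_errlog_model(struct err_entry *old_head, size_t n, uint64_t divergence_txg)
{
	for (size_t i = 0; i < n; i++) {
		if (old_head[i].birth < divergence_txg)
			old_head[i].moved = 1;
	}
}

int
main(void)
{
	struct err_entry log[] = {
		{ 0x10, 120, 0 },	/* written before the heads diverged */
		{ 0x20, 480, 0 },	/* written after the heads diverged */
	};

	swap_errlog_model(log, 2, 300);
	for (size_t i = 0; i < 2; i++)
		printf("blkid %llx %s\n", (unsigned long long)log[i].blkid,
		    log[i].moved ? "moved to promoted head" : "stays");
	return (0);
}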
+ */ + uint64_t new_head_errlog; + error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, + &new_head_errlog); + + if (error != 0) { + new_head_errlog = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, + new_head, new_head_errlog, tx); + } + + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_err_phys_t err_block; + for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + const char *name = ""; + name_to_errphys(za.za_name, &err_block); + if (err_block.zb_birth < txg) { + (void) zap_update(spa->spa_meta_objset, new_head_errlog, + za.za_name, 1, strlen(name) + 1, name, tx); + + (void) zap_remove(spa->spa_meta_objset, old_head_errlog, + za.za_name, tx); + } + } + zap_cursor_fini(&zc); +} + +void +spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, + dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); + swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); + mutex_exit(&spa->spa_errlog_lock); } #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); -EXPORT_SYMBOL(spa_get_errlog_size); +EXPORT_SYMBOL(spa_approx_errlog_size); +EXPORT_SYMBOL(spa_get_last_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); +EXPORT_SYMBOL(spa_delete_dataset_errlog); +EXPORT_SYMBOL(spa_swap_errlog); +EXPORT_SYMBOL(sync_error_list); +EXPORT_SYMBOL(spa_upgrade_errlog); +EXPORT_SYMBOL(find_top_affected_fs); +EXPORT_SYMBOL(find_birth_txg); +EXPORT_SYMBOL(zep_to_zb); +EXPORT_SYMBOL(name_to_errphys); #endif + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW, + "Limit the number of errors which will be upgraded to the new " + "on-disk error log when enabling head_errlog"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c index dae06e46c316..de036d6c3718 100644 --- a/sys/contrib/openzfs/module/zfs/spa_history.c +++ b/sys/contrib/openzfs/module/zfs/spa_history.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -199,7 +199,7 @@ spa_history_log_notify(spa_t *spa, nvlist_t *nvl) { nvlist_t *hist_nvl = fnvlist_alloc(); uint64_t uint64; - char *string; + const char *string; if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string); @@ -248,7 +248,6 @@ spa_history_log_notify(spa_t *spa, nvlist_t *nvl) /* * Write out a history event. */ -/*ARGSUSED*/ static void spa_history_log_sync(void *arg, dmu_tx_t *tx) { diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c index 6fd302b8df34..32158e8c592c 100644 --- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c +++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -177,7 +177,7 @@ * block size as we expect to be writing a lot of data to them at * once. */ -unsigned long zfs_log_sm_blksz = 1ULL << 17; +static const unsigned long zfs_log_sm_blksz = 1ULL << 17; /* * Percentage of the overall system's memory that ZFS allows to be @@ -188,13 +188,13 @@ unsigned long zfs_log_sm_blksz = 1ULL << 17; * (thus the _ppm suffix; reads as "parts per million"). As an example, * the default of 1000 allows 0.1% of memory to be used. */ -unsigned long zfs_unflushed_max_mem_ppm = 1000; +static uint64_t zfs_unflushed_max_mem_ppm = 1000; /* * Specific hard-limit in memory that ZFS allows to be used for * unflushed changes. */ -unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; +static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30; /* * The following tunable determines the number of blocks that can be used for @@ -243,28 +243,33 @@ unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; * provide upper and lower bounds for the log block limit. * [see zfs_unflushed_log_block_{min,max}] */ -unsigned long zfs_unflushed_log_block_pct = 400; +static uint_t zfs_unflushed_log_block_pct = 400; /* * If the number of metaslabs is small and our incoming rate is high, we could * get into a situation that we are flushing all our metaslabs every TXG. Thus * we always allow at least this many log blocks. */ -unsigned long zfs_unflushed_log_block_min = 1000; +static uint64_t zfs_unflushed_log_block_min = 1000; /* * If the log becomes too big, the import time of the pool can take a hit in * terms of performance. Thus we have a hard limit in the size of the log in * terms of blocks. */ -unsigned long zfs_unflushed_log_block_max = (1ULL << 18); +static uint64_t zfs_unflushed_log_block_max = (1ULL << 17); + +/* + * Also we have a hard limit in the size of the log in terms of dirty TXGs. + */ +static uint64_t zfs_unflushed_log_txg_max = 1000; /* * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and * stability of the flushing algorithm (longer summary) vs its runtime overhead * (smaller summary is faster to traverse). */ -unsigned long zfs_max_logsm_summary_length = 10; +static uint64_t zfs_max_logsm_summary_length = 10; /* * Tunable that sets the lower bound on the metaslabs to flush every TXG. @@ -277,7 +282,7 @@ unsigned long zfs_max_logsm_summary_length = 10; * The point of this tunable is to be used in extreme cases where we really * want to flush more metaslabs than our adaptable heuristic plans to flush. */ -unsigned long zfs_min_metaslabs_to_flush = 1; +static uint64_t zfs_min_metaslabs_to_flush = 1; /* * Tunable that specifies how far in the past do we want to look when trying to @@ -288,7 +293,7 @@ unsigned long zfs_min_metaslabs_to_flush = 1; * average over all the blocks that we walk * [see spa_estimate_incoming_log_blocks]. */ -unsigned long zfs_max_log_walking = 5; +static uint64_t zfs_max_log_walking = 5; /* * This tunable exists solely for testing purposes. 
It ensures that the log @@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa) return; } - uint64_t calculated_limit = - (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; - spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + uint64_t msdcount = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + msdcount += e->lse_msdcount; + + uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit, zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); } @@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa) } static boolean_t -summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg) { + if (e->lse_end == txg) + return (0); + if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max, + zfs_max_logsm_summary_length)) + return (1); uint64_t blocks_per_row = MAX(1, DIV_ROUND_UP(spa_log_sm_blocklimit(spa), zfs_max_logsm_summary_length)); @@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) * the metaslab. */ void -spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty) { /* * We don't track summary data for read-only pools and this function @@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) } target->lse_mscount--; + if (dirty) + target->lse_msdcount--; } /* @@ -490,15 +506,12 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) void spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) { - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + log_summary_entry_t *e = list_head(&spa->spa_log_summary); + ASSERT3P(e, !=, NULL); + if (e->lse_txgcount > 0) + e->lse_txgcount--; + for (; e != NULL; e = list_head(&spa->spa_log_summary)) { if (e->lse_blkcount > blocks_gone) { - /* - * Assert that we stopped at an entry that is not - * obsolete. - */ - ASSERT(e->lse_mscount != 0); - e->lse_blkcount -= blocks_gone; blocks_gone = 0; break; @@ -560,31 +573,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa) static void summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, - uint64_t nblocks) + uint64_t metaslabs_dirty, uint64_t nblocks) { log_summary_entry_t *e = list_tail(&spa->spa_log_summary); - if (e == NULL || summary_entry_is_full(spa, e)) { + if (e == NULL || summary_entry_is_full(spa, e, txg)) { e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); - e->lse_start = txg; + e->lse_start = e->lse_end = txg; + e->lse_txgcount = 1; list_insert_tail(&spa->spa_log_summary, e); } ASSERT3U(e->lse_start, <=, txg); + if (e->lse_end < txg) { + e->lse_end = txg; + e->lse_txgcount++; + } e->lse_mscount += metaslabs_flushed; + e->lse_msdcount += metaslabs_dirty; e->lse_blkcount += nblocks; } static void spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) { - summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); + summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks); +} + +void +spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty) +{ + summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 
1 : 0, 0); } void -spa_log_summary_add_flushed_metaslab(spa_t *spa) +spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg) { - summary_add_data(spa, spa_syncing_txg(spa), 1, 0); + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + ASSERT3P(target, !=, NULL); + ASSERT3U(target->lse_mscount, !=, 0); + target->lse_msdcount++; } /* @@ -630,6 +664,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) int64_t available_blocks = spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; + int64_t available_txgs = zfs_unflushed_log_txg_max; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + available_txgs -= e->lse_txgcount; + /* * This variable tells us the total number of flushes needed to * keep the log size within the limit when we reach txgs_in_future. @@ -637,9 +676,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) uint64_t total_flushes = 0; /* Holds the current maximum of our estimates so far. */ - uint64_t max_flushes_pertxg = - MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), - zfs_min_metaslabs_to_flush); + uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush; /* * For our estimations we only look as far in the future @@ -653,11 +690,15 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * then keep skipping TXGs accumulating more blocks * based on the incoming rate until we exceed it. */ - if (available_blocks >= 0) { - uint64_t skip_txgs = (available_blocks / incoming) + 1; + if (available_blocks >= 0 && available_txgs >= 0) { + uint64_t skip_txgs = (incoming == 0) ? + available_txgs + 1 : MIN(available_txgs + 1, + (available_blocks / incoming) + 1); available_blocks -= (skip_txgs * incoming); + available_txgs -= skip_txgs; txgs_in_future += skip_txgs; ASSERT3S(available_blocks, >=, -incoming); + ASSERT3S(available_txgs, >=, -1); } /* @@ -666,9 +707,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * based on the current entry in the summary, updating * our available_blocks. */ - ASSERT3S(available_blocks, <, 0); + ASSERT(available_blocks < 0 || available_txgs < 0); available_blocks += e->lse_blkcount; - total_flushes += e->lse_mscount; + available_txgs += e->lse_txgcount; + total_flushes += e->lse_msdcount; /* * Keep the running maximum of the total_flushes that @@ -680,8 +722,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) */ max_flushes_pertxg = MAX(max_flushes_pertxg, DIV_ROUND_UP(total_flushes, txgs_in_future)); - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - max_flushes_pertxg); } return (max_flushes_pertxg); } @@ -743,7 +783,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * request of flushing everything before we attempt to return * immediately. 
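spa_estimate_metaslabs_to_flush() now runs two budgets side by side: the remaining log-block headroom and the remaining dirty-TXG headroom under zfs_unflushed_log_txg_max, and it may only skip ahead to future TXGs while both stay non-negative. A small sketch of the skip computation with made-up numbers:

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

/*
 * How many future TXGs can pass before either the block budget or the
 * TXG budget of the log is exhausted, given the incoming block rate.
 */
static uint64_t
txgs_we_can_skip(int64_t available_blocks, int64_t available_txgs,
    uint64_t incoming)
{
	if (available_blocks < 0 || available_txgs < 0)
		return (0);
	if (incoming == 0)
		return ((uint64_t)available_txgs + 1);
	return (MIN((uint64_t)available_txgs + 1,
	    (uint64_t)available_blocks / incoming + 1));
}

int
main(void)
{
	/* 900 blocks of headroom, 50 TXGs of headroom, ~30 blocks/txg. */
	printf("can skip %llu txgs\n",
	    (unsigned long long)txgs_we_can_skip(900, 50, 30));
	return (0);
}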
*/ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && !spa_flush_all_logs_requested(spa)) return; @@ -771,14 +811,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) uint64_t want_to_flush; if (spa_flush_all_logs_requested(spa)) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); - want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed); + want_to_flush = UINT64_MAX; } else { want_to_flush = spa_estimate_metaslabs_to_flush(spa); } - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - want_to_flush); - /* Used purely for verification purposes */ uint64_t visited = 0; @@ -809,31 +846,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) break; - mutex_enter(&curr->ms_sync_lock); - mutex_enter(&curr->ms_lock); - boolean_t flushed = metaslab_flush(curr, tx); - mutex_exit(&curr->ms_lock); - mutex_exit(&curr->ms_sync_lock); - - /* - * If we failed to flush a metaslab (because it was loading), - * then we are done with the block heuristic as it's not - * possible to destroy any log space maps once you've skipped - * a metaslab. In that case we just set our counter to 0 but - * we continue looping in case there is still memory pressure - * due to unflushed changes. Note that, flushing a metaslab - * that is not the oldest flushed in the pool, will never - * destroy any log space maps [see spa_cleanup_old_sm_logs()]. - */ - if (!flushed) { - want_to_flush = 0; - } else if (want_to_flush > 0) { - want_to_flush--; - } + if (metaslab_unflushed_dirty(curr)) { + mutex_enter(&curr->ms_sync_lock); + mutex_enter(&curr->ms_lock); + metaslab_flush(curr, tx); + mutex_exit(&curr->ms_lock); + mutex_exit(&curr->ms_sync_lock); + if (want_to_flush > 0) + want_to_flush--; + } else + metaslab_unflushed_bump(curr, tx, B_FALSE); visited++; } ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); + + spa_log_sm_set_blocklimit(spa); } /* @@ -904,6 +932,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) avl_remove(&spa->spa_sm_logs_by_txg, sls); space_map_free_obj(mos, sls->sls_sm_obj, tx); VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks); spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; kmem_free(sls, sizeof (spa_log_sm_t)); } @@ -963,12 +992,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); - /* - * If the log space map feature was just enabled, the blocklimit - * has not yet been set. 
- */ - if (spa_log_sm_blocklimit(spa) == 0) - spa_log_sm_set_blocklimit(spa); + spa_log_sm_set_blocklimit(spa); } /* @@ -1094,12 +1118,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) panic("invalid maptype_t"); break; } + if (!metaslab_unflushed_dirty(ms)) { + metaslab_set_unflushed_dirty(ms, B_TRUE); + spa_log_summary_dirty_flushed_metaslab(spa, + metaslab_unflushed_txg(ms)); + } return (0); } static int spa_ld_log_sm_data(spa_t *spa) { + spa_log_sm_t *sls, *psls; int error = 0; /* @@ -1113,50 +1143,98 @@ spa_ld_log_sm_data(spa_t *spa) ASSERT0(spa->spa_unflushed_stats.sus_memused); hrtime_t read_logs_starttime = gethrtime(); - /* this is a no-op when we don't have space map logs */ - for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); - sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - space_map_t *sm = NULL; - error = space_map_open(&sm, spa_meta_objset(spa), - sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); - if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " - "space_map_open(obj=%llu) [error %d]", - (u_longlong_t)sls->sls_sm_obj, error); - goto out; + + /* Prefetch log spacemaps dnodes. */ + for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); + } + + uint_t pn = 0; + uint64_t ps = 0; + uint64_t nsm = 0; + psls = sls = avl_first(&spa->spa_sm_logs_by_txg); + while (sls != NULL) { + /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */ + if (psls != NULL && pn < 16 && + (pn < 2 || ps < 2 * dmu_prefetch_max)) { + error = space_map_open(&psls->sls_sm, + spa_meta_objset(spa), psls->sls_sm_obj, 0, + UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): " + "failed at space_map_open(obj=%llu) " + "[error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj, + 0, 0, space_map_length(psls->sls_sm), + ZIO_PRIORITY_ASYNC_READ); + pn++; + ps += space_map_length(psls->sls_sm); + psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls); + continue; } + /* Load TXG log spacemap into ms_unflushed_allocs/frees. */ + kpreempt(KPREEMPT_SYNC); + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sls->sls_sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, 0, sls->sls_nblocks); + + spa_import_progress_set_notes_nolog(spa, + "Read %llu of %lu log space maps", (u_longlong_t)nsm, + avl_numnodes(&spa->spa_sm_logs_by_txg)); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg }; - error = space_map_iterate(sm, space_map_length(sm), - spa_ld_log_sm_cb, &vla); + error = space_map_iterate(sls->sls_sm, + space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla); if (error != 0) { - space_map_close(sm); spa_load_failed(spa, "spa_ld_log_sm_data(): failed " "at space_map_iterate(obj=%llu) [error %d]", (u_longlong_t)sls->sls_sm_obj, error); goto out; } - ASSERT0(sls->sls_nblocks); - sls->sls_nblocks = space_map_nblocks(sm); - spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; - summary_add_data(spa, sls->sls_txg, - sls->sls_mscount, sls->sls_nblocks); + pn--; + ps -= space_map_length(sls->sls_sm); + nsm++; + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); - space_map_close(sm); + /* Update log block limits considering just loaded. 
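The rewritten spa_ld_log_sm_data() pipelines the import-time load: it keeps opening and prefetching log space maps up to 16 objects ahead of the one currently being iterated, bounded in bytes by about twice dmu_prefetch_max, so the next reads are already in flight while the current map is processed. A generic sketch of such a bounded read-ahead window, where prefetch() and process() are placeholders:

#include <stdio.h>

#define	NITEMS		8
#define	MAX_AHEAD	3	/* window depth, like the 16-object cap */

static void
prefetch(int i)
{
	printf("prefetch item %d\n", i);	/* would issue async reads */
}

static void
process(int i)
{
	printf("process  item %d\n", i);	/* consumes prefetched data */
}

int
main(void)
{
	int next_prefetch = 0;

	for (int cur = 0; cur < NITEMS; cur++) {
		/* Keep the window ahead of the consumer, but bounded. */
		while (next_prefetch < NITEMS &&
		    next_prefetch - cur < MAX_AHEAD)
			prefetch(next_prefetch++);
		process(cur);
	}
	return (0);
}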
*/ + spa_log_sm_set_blocklimit(spa); } + hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, - "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " - "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg), (u_longlong_t)spa_log_sm_nblocks(spa), (u_longlong_t)zfs_log_sm_blksz, - (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime)); out: + if (error != 0) { + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + if (sls->sls_sm) { + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + } + } + } else { + ASSERT0(pn); + ASSERT0(ps); + } /* * Now that the metaslabs contain their unflushed changes: * [1] recalculate their actual allocated space @@ -1237,6 +1315,9 @@ spa_ld_unflushed_txgs(vdev_t *vd) } ms->ms_unflushed_txg = entry.msp_unflushed_txg; + ms->ms_unflushed_dirty = B_FALSE; + ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(ms->ms_unflushed_frees)); if (ms->ms_unflushed_txg != 0) { mutex_enter(&spa->spa_flushed_ms_lock); avl_add(&spa->spa_metaslabs_by_flushed, ms); @@ -1283,40 +1364,44 @@ spa_ld_log_spacemaps(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW, - "Specific hard-limit in memory that ZFS allows to be used for " - "unflushed changes"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW, + "Specific hard-limit in memory that ZFS allows to be used for " + "unflushed changes"); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW, - "Percentage of the overall system memory that ZFS allows to be " - "used for unflushed changes (value is calculated over 1000000 for " - "finer granularity)"); - -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW, - "Hard limit (upper-bound) in the size of the space map log " - "in terms of blocks."); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW, + "Percentage of the overall system memory that ZFS allows to be " + "used for unflushed changes (value is calculated over 1000000 for " + "finer granularity)"); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW, - "Lower-bound limit for the maximum amount of blocks allowed in " - "log spacemap (see zfs_unflushed_log_block_max)"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW, + "Hard limit (upper-bound) in the size of the space map log " + "in terms of blocks."); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW, - "Tunable used to determine the number of blocks that can be used for " - "the spacemap log, expressed as a percentage of the total number of " - "metaslabs in the pool (e.g. 
400 means the number of log blocks is " - "capped at 4 times the number of metaslabs)"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW, + "Lower-bound limit for the maximum amount of blocks allowed in " + "log spacemap (see zfs_unflushed_log_block_max)"); -ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW, - "The number of past TXGs that the flushing algorithm of the log " - "spacemap feature uses to estimate incoming log blocks"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW, + "Hard limit (upper-bound) in the size of the space map log " + "in terms of dirty TXGs."); -ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW, - "Maximum number of rows allowed in the summary of the spacemap log"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW, + "Tunable used to determine the number of blocks that can be used for " + "the spacemap log, expressed as a percentage of the total number of " + "metaslabs in the pool (e.g. 400 means the number of log blocks is " + "capped at 4 times the number of metaslabs)"); -ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW, - "Minimum number of metaslabs to flush per dirty TXG"); +ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW, + "The number of past TXGs that the flushing algorithm of the log " + "spacemap feature uses to estimate incoming log blocks"); ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, - "Prevent the log spacemaps from being flushed and destroyed " - "during pool export/destroy"); + "Prevent the log spacemaps from being flushed and destroyed " + "during pool export/destroy"); /* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW, + "Maximum number of rows allowed in the summary of the spacemap log"); + +ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW, + "Minimum number of metaslabs to flush per dirty TXG"); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 1ecd2294dba0..d1d41bbe7214 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,16 +20,18 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ #include <sys/zfs_context.h> +#include <sys/zfs_chksum.h> #include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> @@ -56,6 +58,7 @@ #include <sys/fs/zfs.h> #include <sys/metaslab_impl.h> #include <sys/arc.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/kstat.h> #include "zfs_prop.h" @@ -79,7 +82,8 @@ * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices - * - Held for the duration of create/destroy/import/export + * - Held for the duration of create/destroy + * - Held at the start and end of import and export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by @@ -232,17 +236,16 @@ * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ -static avl_tree_t spa_namespace_avl; +avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; -static kcondvar_t spa_namespace_cv; -int spa_max_replication_override = SPA_DVAS_PER_BP; +kcondvar_t spa_namespace_cv; +static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; -kmem_cache_t *spa_buffer_pool; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG @@ -304,25 +307,25 @@ int zfs_free_leak_on_eio = B_FALSE; * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ -unsigned long zfs_deadman_synctime_ms = 600000UL; +uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ -unsigned long zfs_deadman_ziotime_ms = 300000UL; +uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ -unsigned long zfs_deadman_checktime_ms = 60000UL; +uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. */ -int zfs_deadman_enabled = 1; +int zfs_deadman_enabled = B_TRUE; /* * Controls the behavior of the deadman when it detects a "hung" I/O. @@ -332,7 +335,7 @@ int zfs_deadman_enabled = 1; * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ -char *zfs_deadman_failmode = "wait"; +const char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which @@ -343,7 +346,7 @@ char *zfs_deadman_failmode = "wait"; * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ -int spa_asize_inflation = 24; +uint_t spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in @@ -383,11 +386,21 @@ int spa_asize_inflation = 24; * * See also the comments in zfs_space_check_t. */ -int spa_slop_shift = 5; -uint64_t spa_min_slop = 128ULL * 1024 * 1024; -uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; -int spa_allocators = 4; +uint_t spa_slop_shift = 5; +static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; +static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; +/* + * Number of allocators to use, per spa instance + */ +static int spa_num_allocators = 4; +static int spa_cpus_per_allocator = 4; + +/* + * Spa active allocator. 
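The spa_asize_inflation default of 24 is exactly the worst case stated in the comment: a single-sector block on triple-parity RAID-Z writes parity + 1 = 4 sectors per logical sector, a block pointer can carry 3 DVAs, and the formula applies a further factor of 2, giving (3 + 1) * 3 * 2 = 24. Spelled out with the header values:

#include <stdio.h>

/* Values as defined in the ZFS headers. */
#define	VDEV_RAIDZ_MAXPARITY	3	/* raidz3 */
#define	SPA_DVAS_PER_BP		3	/* up to three copies per blkptr */

int
main(void)
{
	int inflation = (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2;

	printf("worst-case asize inflation: %dx\n", inflation);	/* 24 */
	return (0);
}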
+ * Valid values are zfs_active_allocator=<dynamic|cursor|new-dynamic>. + */ +const char *zfs_active_allocator = "dynamic"; void spa_load_failed(spa_t *spa, const char *fmt, ...) @@ -415,20 +428,22 @@ spa_load_note(spa_t *spa, const char *fmt, ...) zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); + + spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* * By default dedup and user data indirects land in the special class */ -int zfs_ddt_data_is_special = B_TRUE; -int zfs_user_indirect_is_special = B_TRUE; +static int zfs_ddt_data_is_special = B_TRUE; +static int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ -int zfs_special_class_metadata_reserve_pct = 25; +static uint_t zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== @@ -462,7 +477,7 @@ spa_config_lock_destroy(spa_t *spa) } int -spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; @@ -492,9 +507,11 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) return (1); } -void -spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +static void +spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, + int mmp_flag) { + (void) tag; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); @@ -507,7 +524,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { - while (scl->scl_writer || scl->scl_write_wanted) { + while (scl->scl_writer || + (!mmp_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -526,8 +544,30 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) } void +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 0); +} + +/* + * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * outstanding write lock requests. This is needed since the mmp updates are + * time sensitive and failure to service them promptly will result in a + * suspended pool. This pool suspension has been seen in practice when there is + * a single disk in a pool that is responding slowly and presumably about to + * fail. + */ + +void +spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 1); +} + +void spa_config_exit(spa_t *spa, int locks, const void *tag) { + (void) tag; for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) @@ -581,6 +621,7 @@ spa_lookup(const char *name) ASSERT(MUTEX_HELD(&spa_namespace_lock)); +retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* @@ -592,6 +633,20 @@ spa_lookup(const char *name) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); + if (spa == NULL) + return (NULL); + + /* + * Avoid racing with import/export, which don't hold the namespace + * lock for their entire duration. 
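[Editorial note] The spa_config_enter_impl() hunk above gives the MMP thread a priority reader path: a normal reader also yields to threads that have signalled write intent (scl_write_wanted), while the MMP reader only waits for an active writer. A minimal userland sketch of that wait condition, using pthreads and hypothetical names (rwl_t, rwl_read_enter) rather than the ZFS primitives, purely for illustration:

/*
 * Illustrative sketch only, not ZFS code.  Mirrors the condition
 * "scl->scl_writer || (!mmp_flag && scl->scl_write_wanted)" added above.
 */
#include <pthread.h>

typedef struct rwl {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	int		writer_active;	/* a writer currently holds the lock */
	int		writer_wanted;	/* a writer is queued and waiting */
	int		readers;
} rwl_t;

static void
rwl_read_enter(rwl_t *l, int priority)
{
	pthread_mutex_lock(&l->lock);
	/*
	 * A priority (MMP-style) reader ignores writer_wanted and only
	 * waits for an active writer; a normal reader waits for both.
	 */
	while (l->writer_active || (!priority && l->writer_wanted))
		pthread_cond_wait(&l->cv, &l->lock);
	l->readers++;
	pthread_mutex_unlock(&l->lock);
}

This is why a slow disk can no longer starve the time-sensitive MMP updates behind queued write-lock requests, as the new spa_config_enter_mmp() comment explains.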
+ */ + if ((spa->spa_load_thread != NULL && + spa->spa_load_thread != curthread) || + (spa->spa_export_thread != NULL && + spa->spa_export_thread != curthread)) { + cv_wait(&spa_namespace_cv, &spa_namespace_lock); + goto retry; + } return (spa); } @@ -684,11 +739,13 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); + spa_set_allocator(spa, zfs_active_allocator); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* @@ -697,15 +754,25 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) if (altroot) spa->spa_root = spa_strdup(altroot); - spa->spa_alloc_count = spa_allocators; + /* Do not allow more allocators than fraction of CPUs. */ + spa->spa_alloc_count = MAX(MIN(spa_num_allocators, + boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); + spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } + if (spa->spa_alloc_count > 1) { + spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count]), KM_SLEEP); + mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT, + NULL); + } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, @@ -746,6 +813,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; @@ -783,13 +851,11 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); - cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); - while ((dp = list_head(&spa->spa_config_list)) != NULL) { - list_remove(&spa->spa_config_list, dp); + while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); @@ -801,6 +867,11 @@ spa_remove(spa_t *spa) } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); + if (spa->spa_alloc_count > 1) { + mutex_destroy(&spa->spa_allocs_use->sau_lock); + kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, + sau_inuse[spa->spa_alloc_count])); + } avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); @@ -875,22 +946,25 @@ spa_next(spa_t *prev) * have the namespace lock held. */ void -spa_open_ref(spa_t *spa, void *tag) +spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or - * have the namespace lock held. 
+ * have the namespace lock held or be part of a pool import/export. */ void -spa_close(spa_t *spa, void *tag) +spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || - MUTEX_HELD(&spa_namespace_lock)); + MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread || + spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } @@ -903,20 +977,22 @@ spa_close(spa_t *spa, void *tag) * so the asserts in spa_close() do not apply. */ void -spa_async_close(spa_t *spa, void *tag) +spa_async_close(spa_t *spa, const void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with - * spa_namespace_lock held. We really compare against spa_minref, which is the - * number of references acquired when opening a pool + * spa_namespace_lock held or be the spa export thread. We really + * compare against spa_minref, which is the number of references + * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } @@ -1164,6 +1240,8 @@ spa_vdev_enter(spa_t *spa) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); @@ -1181,6 +1259,8 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid) mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + ASSERT0(spa->spa_export_thread); + vdev_autotrim_stop_all(spa); if (guid != 0) { @@ -1213,7 +1293,8 @@ spa_vdev_config_enter(spa_t *spa) * of multiple transactions without releasing the spa_namespace_lock. */ void -spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) +spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, + const char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -1287,7 +1368,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) * If the config changed, update the config cache. 
*/ if (config_changed) - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); } /* @@ -1382,7 +1463,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) */ if (config_changed) { mutex_enter(&spa_namespace_lock); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } @@ -1476,8 +1557,7 @@ spa_strdup(const char *s) len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); - bcopy(s, new, len); - new[len] = '\0'; + memcpy(new, s, len + 1); return (new); } @@ -1512,8 +1592,8 @@ void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; - char *checksum = NULL; - char *compress = NULL; + const char *checksum = NULL; + const char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { @@ -1534,7 +1614,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } - SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, + SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum, compress); } @@ -1652,10 +1732,10 @@ spa_altroot(spa_t *spa, char *buf, size_t buflen) if (spa->spa_root == NULL) buf[0] = '\0'; else - (void) strncpy(buf, spa->spa_root, buflen); + (void) strlcpy(buf, spa->spa_root, buflen); } -int +uint32_t spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); @@ -1795,7 +1875,8 @@ spa_get_slop_space(spa_t *spa) * deduplicated data, so since it's not useful to reserve more * space with more deduplicated data, we subtract that out here. */ - space = spa_get_dspace(spa) - spa->spa_dedup_dspace; + space = + spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa); slop = MIN(space >> spa_slop_shift, spa_max_slop); /* @@ -1832,37 +1913,28 @@ void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa); - if (spa->spa_vdev_removal != NULL) { + ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); + if (spa->spa_nonallocating_dspace > 0) { /* - * We can't allocate from the removing device, so subtract - * its size if it was included in dspace (i.e. if this is a - * normal-class vdev, not special/dedup). This prevents the - * DMU/DSL from filling up the (now smaller) pool while we - * are in the middle of removing the device. + * Subtract the space provided by all non-allocating vdevs that + * contribute to dspace. If a file is overwritten, its old + * blocks are freed and new blocks are allocated. If there are + * no snapshots of the file, the available space should remain + * the same. The old blocks could be freed from the + * non-allocating vdev, but the new blocks must be allocated on + * other (allocating) vdevs. By reserving the entire size of + * the non-allocating vdevs (including allocated space), we + * ensure that there will be enough space on the allocating + * vdevs for this file overwrite to succeed. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be - * allocated twice (on the old device and the new - * device). 
- */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vdev_t *vd = - vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - /* - * If the stars align, we can wind up here after - * vdev_remove_complete() has cleared vd->vdev_mg but before - * spa->spa_vdev_removal gets cleared, so we must check before - * we dereference. + * allocated twice (on the old device and the new device). */ - if (vd->vdev_mg && - vd->vdev_mg->mg_class == spa_normal_class(spa)) { - spa->spa_dspace -= spa_deflate(spa) ? - vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; - } - spa_config_exit(spa, SCL_VDEV, FTAG); + ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace); + spa->spa_dspace -= spa->spa_nonallocating_dspace; } } @@ -2153,6 +2225,7 @@ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; + char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; @@ -2163,9 +2236,9 @@ spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { - seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", + seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", - "pool_name"); + "pool_name", "notes"); return (0); } @@ -2174,11 +2247,12 @@ spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; - seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", + seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, - (sip->pool_name ? sip->pool_name : "-")); + (sip->pool_name ? sip->pool_name : "-"), + (sip->spa_load_notes ? 
sip->spa_load_notes : "-")); return (0); } @@ -2192,6 +2266,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } @@ -2247,6 +2323,10 @@ spa_import_progress_set_state(uint64_t pool_guid, sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } error = 0; break; } @@ -2256,6 +2336,59 @@ spa_import_progress_set_state(uint64_t pool_guid, return (error); } +static void +spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, + const char *fmt, va_list adx) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + uint64_t pool_guid = spa_guid(spa); + + if (shl->size == 0) + return; + + char *notes = kmem_vasprintf(fmt, adx); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } + sip->spa_load_notes = notes; + if (log_dbgmsg) + zfs_dbgmsg("'%s' %s", sip->pool_name, notes); + notes = NULL; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + if (notes != NULL) + kmem_strfree(notes); +} + +void +spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); + va_end(adx); +} + +void +spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) 
+{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); + va_end(adx); +} + int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { @@ -2313,7 +2446,7 @@ spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; - char *poolname = NULL; + const char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); @@ -2324,6 +2457,7 @@ spa_import_progress_add(spa_t *spa) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); + sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); @@ -2343,6 +2477,8 @@ spa_import_progress_remove(uint64_t pool_guid) if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); @@ -2417,18 +2553,20 @@ spa_init(spa_mode_t mode) unique_init(); zfs_btree_init(); metaslab_stat_init(); + brt_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); - vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); + chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); + vdev_prop_init(); l2arc_start(); scan_init(); qat_init(); @@ -2443,13 +2581,14 @@ spa_fini(void) spa_evict_all(); vdev_file_fini(); - vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); + chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); + brt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); @@ -2557,10 +2696,18 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; + + if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) + spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; + else + spa->spa_scan_pass_errorscrub_pause = 0; + spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; - vdev_scan_stat_init(spa->spa_root_vdev); + + // error scrub stats + spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* @@ -2571,9 +2718,11 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && + scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); - bzero(ps, sizeof (pool_scan_stat_t)); + + memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; @@ -2582,7 +2731,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; - ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; @@ -2595,6 +2744,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; + /* error scrub data stored on disk */ + ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; + ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; + ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; + ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; + ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; + ps->pss_error_scrub_to_be_examined = + scn->errorscrub_phys.dep_to_examine; + + /* error scrub data not stored on disk */ + ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; + return (0); } @@ -2714,8 +2875,7 @@ spa_state_to_name(spa_t *spa) vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; - if (spa_suspended(spa) && - (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) + if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { @@ -2926,13 +3086,13 @@ ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); -ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); -ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, @@ -2947,17 +3107,23 @@ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, - param_set_deadman_synctime, param_get_ulong, ZMOD_RW, + param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, - param_set_deadman_ziotime, param_get_ulong, ZMOD_RW, + param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW, "IO expiration time in milliseconds"); -ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); /* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, - param_get_int, ZMOD_RW, "Reserved free space in pool"); + param_get_uint, ZMOD_RW, "Reserved free space in pool"); + 
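[Editorial note] The spa_num_allocators and spa_cpus_per_allocator tunables registered just below feed the clamp added to spa_add() earlier in this diff: spa_alloc_count = MAX(MIN(spa_num_allocators, boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1). A small standalone sketch of that computation, with a hypothetical helper name, for illustration only:

/*
 * Illustrative sketch of the allocator-count clamp used by spa_add();
 * alloc_count_for() is a hypothetical helper, not ZFS code.
 */
#define	XMIN(a, b)	((a) < (b) ? (a) : (b))
#define	XMAX(a, b)	((a) > (b) ? (a) : (b))

static int
alloc_count_for(int num_allocators, int cpus_per_allocator, int ncpus)
{
	/* Cap allocators at ncpus / cpus_per_allocator, keep at least one. */
	return (XMAX(XMIN(num_allocators,
	    ncpus / XMAX(cpus_per_allocator, 1)), 1));
}

/*
 * With the defaults (4 allocators, 4 CPUs per allocator):
 *   32 CPUs -> XMIN(4, 8) = 4 allocators
 *    4 CPUs -> XMIN(4, 1) = 1 allocator
 */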
+ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, + "Number of allocators per spa"); + +ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, + "Minimum number of CPUs per allocators"); diff --git a/sys/contrib/openzfs/module/zfs/spa_stats.c b/sys/contrib/openzfs/module/zfs/spa_stats.c index 534ac72fee7b..17ed2a620b1e 100644 --- a/sys/contrib/openzfs/module/zfs/spa_stats.c +++ b/sys/contrib/openzfs/module/zfs/spa_stats.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,22 +28,22 @@ /* * Keeps stats on last N reads per spa_t, disabled by default. */ -int zfs_read_history = 0; +static uint_t zfs_read_history = B_FALSE; /* * Include cache hits in history, disabled by default. */ -int zfs_read_history_hits = 0; +static int zfs_read_history_hits = B_FALSE; /* * Keeps stats on the last 100 txgs by default. */ -int zfs_txg_history = 100; +static uint_t zfs_txg_history = 100; /* * Keeps stats on the last N MMP updates, disabled by default. */ -int zfs_multihost_history = 0; +static uint_t zfs_multihost_history = B_FALSE; /* * ========================================================================== @@ -819,6 +819,41 @@ spa_state_init(spa_t *spa) kmem_strfree(name); } +static int +spa_guid_data(char *buf, size_t size, void *data) +{ + spa_t *spa = (spa_t *)data; + (void) snprintf(buf, size, "%llu\n", (u_longlong_t)spa_guid(spa)); + return (0); +} + +static void +spa_guid_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.guid; + char *name; + kstat_t *ksp; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + name = kmem_asprintf("zfs/%s", spa_name(spa)); + + ksp = kstat_create(name, 0, "guid", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; + kstat_set_raw_ops(ksp, NULL, spa_guid_data, spa_state_addr); + kstat_install(ksp); + } + + kmem_strfree(name); +} + static void spa_health_destroy(spa_t *spa) { @@ -830,7 +865,18 @@ spa_health_destroy(spa_t *spa) mutex_destroy(&shk->lock); } -static spa_iostats_t spa_iostats_template = { +static void +spa_guid_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.guid; + kstat_t *ksp = shk->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&shk->lock); +} + +static const spa_iostats_t spa_iostats_template = { { "trim_extents_written", KSTAT_DATA_UINT64 }, { "trim_bytes_written", KSTAT_DATA_UINT64 }, { "trim_extents_skipped", KSTAT_DATA_UINT64 }, @@ -950,6 +996,7 @@ spa_stats_init(spa_t *spa) spa_tx_assign_init(spa); spa_mmp_history_init(spa); spa_state_init(spa); + spa_guid_init(spa); spa_iostats_init(spa); } @@ -962,18 +1009,17 @@ spa_stats_destroy(spa_t *spa) spa_txg_history_destroy(spa); spa_read_history_destroy(spa); spa_mmp_history_destroy(spa); + spa_guid_destroy(spa); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW, - "Historical statistics for the last N reads"); +ZFS_MODULE_PARAM(zfs, zfs_, read_history, UINT, ZMOD_RW, + "Historical statistics for the last N reads"); ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW, - "Include cache hits in read history"); + "Include cache hits in read 
history"); -ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW, - "Historical statistics for the last N txgs"); +ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, UINT, ZMOD_RW, + "Historical statistics for the last N txgs"); -ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW, - "Historical statistics for last N multihost writes"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, UINT, ZMOD_RW, + "Historical statistics for last N multihost writes"); diff --git a/sys/contrib/openzfs/module/zfs/space_map.c b/sys/contrib/openzfs/module/zfs/space_map.c index 11d4798925e4..a336ff41eadb 100644 --- a/sys/contrib/openzfs/module/zfs/space_map.c +++ b/sys/contrib/openzfs/module/zfs/space_map.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -455,7 +455,8 @@ space_map_histogram_clear(space_map_t *sm) if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) return; - bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); + memset(sm->sm_phys->smp_histogram, 0, + sizeof (sm->sm_phys->smp_histogram)); } boolean_t @@ -548,7 +549,7 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) static void space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, - void *tag, dmu_tx_t *tx) + const void *tag, dmu_tx_t *tx) { ASSERT3U(words, !=, 0); ASSERT3U(words, <=, 2); @@ -896,7 +897,7 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) * will be reset. Do the same in the common case so that * bugs related to the uncommon case do not go unnoticed. */ - bzero(sm->sm_phys->smp_histogram, + memset(sm->sm_phys->smp_histogram, 0, sizeof (sm->sm_phys->smp_histogram)); } diff --git a/sys/contrib/openzfs/module/zfs/space_reftree.c b/sys/contrib/openzfs/module/zfs/space_reftree.c index 080fc6646512..ee11e162dd5b 100644 --- a/sys/contrib/openzfs/module/zfs/space_reftree.c +++ b/sys/contrib/openzfs/module/zfs/space_reftree.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c index c9eb84bbdb12..5ce6be69be14 100644 --- a/sys/contrib/openzfs/module/zfs/txg.c +++ b/sys/contrib/openzfs/module/zfs/txg.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -108,10 +108,10 @@ * now transition to the syncing state. 
*/ -static void txg_sync_thread(void *arg); -static void txg_quiesce_thread(void *arg); +static __attribute__((noreturn)) void txg_sync_thread(void *arg); +static __attribute__((noreturn)) void txg_quiesce_thread(void *arg); -int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ +uint_t zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ /* * Prepare the txg subsystem. @@ -121,7 +121,7 @@ txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int c; - bzero(tx, sizeof (tx_state_t)); + memset(tx, 0, sizeof (tx_state_t)); tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); @@ -186,7 +186,7 @@ txg_fini(dsl_pool_t *dp) vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); - bzero(tx, sizeof (tx_state_t)); + memset(tx, 0, sizeof (tx_state_t)); } /* @@ -429,7 +429,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) } static void -txg_do_callbacks(list_t *cb_list) +txg_do_callbacks(void *cb_list) { dmu_tx_do_callbacks(cb_list, 0); @@ -479,7 +479,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) list_move_tail(cb_list, &tc->tc_callbacks[g]); - (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + (void) taskq_dispatch(tx->tx_commit_cb_taskq, txg_do_callbacks, cb_list, TQ_SLEEP); } } @@ -514,7 +514,7 @@ txg_has_quiesced_to_sync(dsl_pool_t *dp) return (tx->tx_quiesced_txg != 0); } -static void +static __attribute__((noreturn)) void txg_sync_thread(void *arg) { dsl_pool_t *dp = arg; @@ -551,6 +551,15 @@ txg_sync_thread(void *arg) } /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after a MMP suspend. + */ + if (spa_suspended(spa)) + continue; + + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ @@ -605,7 +614,7 @@ txg_sync_thread(void *arg) } } -static void +static __attribute__((noreturn)) void txg_quiesce_thread(void *arg) { dsl_pool_t *dp = arg; @@ -895,15 +904,10 @@ txg_list_destroy(txg_list_t *tl) boolean_t txg_all_lists_empty(txg_list_t *tl) { - mutex_enter(&tl->tl_lock); - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty_impl(tl, i)) { - mutex_exit(&tl->tl_lock); - return (B_FALSE); - } - } - mutex_exit(&tl->tl_lock); - return (B_TRUE); + boolean_t res = B_TRUE; + for (int i = 0; i < TXG_SIZE; i++) + res &= (tl->tl_head[i] == NULL); + return (res); } /* @@ -1069,7 +1073,5 @@ EXPORT_SYMBOL(txg_wait_callbacks); EXPORT_SYMBOL(txg_stalled); EXPORT_SYMBOL(txg_sync_waiting); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, UINT, ZMOD_RW, "Max seconds worth of delta per txg"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/uberblock.c b/sys/contrib/openzfs/module/zfs/uberblock.c index b8857d74d810..22ee8036c473 100644 --- a/sys/contrib/openzfs/module/zfs/uberblock.c +++ b/sys/contrib/openzfs/module/zfs/uberblock.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -70,5 +70,5 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) } ub->ub_checkpoint_txg = 0; - return (ub->ub_rootbp.blk_birth == txg); + return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg); } diff --git a/sys/contrib/openzfs/module/zfs/unique.c b/sys/contrib/openzfs/module/zfs/unique.c index 0e076797a002..799e4095db33 100644 --- a/sys/contrib/openzfs/module/zfs/unique.c +++ b/sys/contrib/openzfs/module/zfs/unique.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 4a67ba85f58a..c74f72159dc9 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -28,7 +28,8 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include <sys/zfs_context.h> @@ -57,8 +58,10 @@ #include <sys/abd.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> +#include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are @@ -79,22 +82,22 @@ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab. */ -int zfs_embedded_slog_min_ms = 64; +static uint_t zfs_embedded_slog_min_ms = 64; /* default target for number of metaslabs per top-level vdev */ -int zfs_vdev_default_ms_count = 200; +static uint_t zfs_vdev_default_ms_count = 200; /* minimum number of metaslabs per top-level vdev */ -int zfs_vdev_min_ms_count = 16; +static uint_t zfs_vdev_min_ms_count = 16; /* practical upper limit of total metaslabs per top-level vdev */ -int zfs_vdev_ms_count_limit = 1ULL << 17; +static uint_t zfs_vdev_ms_count_limit = 1ULL << 17; /* lower limit for metaslab size (512M) */ -int zfs_vdev_default_ms_shift = 29; +static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ -int zfs_vdev_max_ms_shift = 34; +static uint_t zfs_vdev_max_ms_shift = 34; int vdev_validate_skip = B_FALSE; @@ -107,18 +110,23 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12); /* * Rate limit slow IO (delay) events to this many per second. */ -unsigned int zfs_slow_io_events_per_second = 20; +static unsigned int zfs_slow_io_events_per_second = 20; + +/* + * Rate limit deadman "hung IO" events to this many per second. + */ +static unsigned int zfs_deadman_events_per_second = 1; /* * Rate limit checksum events after this many checksum errors per second. 
*/ -unsigned int zfs_checksum_events_per_second = 20; +static unsigned int zfs_checksum_events_per_second = 20; /* * Ignore errors during scrub/resilver. Allows to work around resilver * upon import when there are pool errors. */ -int zfs_scan_ignore_errors = 0; +static int zfs_scan_ignore_errors = 0; /* * vdev-wide space maps that have lots of entries written to them at @@ -134,8 +142,16 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; -uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; -uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; +/* + * Maximum and minimum ashift values that can be automatically set based on + * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX + * is higher than the maximum value, it is intentionally limited here to not + * excessively impact pool space efficiency. Higher ashift values may still + * be forced by vdev logical ashift or by user via ashift property, but won't + * be set automatically as a performance optimization. + */ +uint_t zfs_vdev_max_auto_ashift = 14; +uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) @@ -214,7 +230,7 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) * Virtual device management. */ -static vdev_ops_t *vdev_ops_table[] = { +static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, @@ -236,7 +252,7 @@ static vdev_ops_t *vdev_ops_table[] = { static vdev_ops_t * vdev_getops(const char *type) { - vdev_ops_t *ops, **opspp; + vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) @@ -261,11 +277,12 @@ vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) return (vd->vdev_mg); } -/* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { + (void) vd, (void) remain_rs; + physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } @@ -294,13 +311,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. */ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -336,7 +353,8 @@ vdev_get_min_asize(vdev_t *vd) * to the nearest metaslab. 
*/ if (vd == vd->vdev_top) - return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); + return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, + uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } @@ -378,6 +396,33 @@ vdev_get_nparity(vdev_t *vd) return (nparity); } +static int +vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + uint64_t objid; + int err; + + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + return (EINVAL); + } + + err = zap_lookup(mos, objid, vdev_prop_to_name(prop), + sizeof (uint64_t), 1, value); + + if (err == ENOENT) + *value = vdev_prop_default_numeric(prop); + + return (err); +} + /* * Get the number of data disks for a top-level vdev. */ @@ -472,7 +517,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { - bcopy(pvd->vdev_child, newchild, oldsize); + memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } @@ -626,11 +671,21 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); - zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, + zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); + /* + * Default Thresholds for tuning ZED + */ + vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); + vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); + vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); + vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); + vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); + vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); + list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); @@ -652,6 +707,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); @@ -668,7 +724,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); - vdev_cache_init(vd); return (vd); } @@ -683,11 +738,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; - char *type; + const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; - char *tmp = NULL; + const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); @@ -742,7 +797,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { - char *bias; + const char *bias; /* * If creating a top-level vdev, check for allocation @@ -788,8 +843,8 
@@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) + vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a @@ -803,18 +858,17 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) - vd->vdev_devid = spa_strdup(vd->vdev_devid); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, - &vd->vdev_physpath) == 0) - vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) + vd->vdev_devid = spa_strdup(tmp); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) + vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, - &vd->vdev_enc_sysfs_path) == 0) - vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); + &tmp) == 0) + vd->vdev_enc_sysfs_path = spa_strdup(tmp); - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) - vd->vdev_fru = spa_strdup(vd->vdev_fru); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) + vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value @@ -844,9 +898,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_not_present); /* - * Get the alignment requirement. + * Get the alignment requirement. Ignore pool ashift for vdev + * attach case. */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + if (alloctype != VDEV_ALLOC_ATTACH) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, + &vd->vdev_ashift); + } else { + vd->vdev_attaching = B_TRUE; + } /* * Retrieve the vdev creation time. @@ -854,6 +914,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); + if (vd->vdev_ops == &vdev_root_ops && + (alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL)) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, + &vd->vdev_root_zap); + } + /* * If we're a top-level vdev, try to load the allocation parameters. */ @@ -865,10 +933,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, + &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -943,7 +1015,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { - char *aux; + const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; @@ -1040,7 +1112,6 @@ vdev_free(vdev_t *vd) * Clean up vdev structure. 
*/ vdev_queue_fini(vd); - vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); @@ -1103,6 +1174,7 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); + cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); @@ -1131,7 +1203,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT(tvd == tvd->vdev_top); - tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; @@ -1183,8 +1254,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); + tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; @@ -1200,6 +1273,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; + svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; @@ -1335,6 +1409,36 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +/* + * Choose GCD for spa_gcd_alloc. + */ +static uint64_t +vdev_gcd(uint64_t a, uint64_t b) +{ + while (b != 0) { + uint64_t t = b; + b = a % b; + a = t; + } + return (a); +} + +/* + * Set spa_min_alloc and spa_gcd_alloc. + */ +static void +vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) +{ + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + if (spa->spa_gcd_alloc == INT_MAX) { + spa->spa_gcd_alloc = min_alloc; + } else { + spa->spa_gcd_alloc = vdev_gcd(min_alloc, + spa->spa_gcd_alloc); + } +} + void vdev_metaslab_group_create(vdev_t *vd) { @@ -1387,8 +1491,7 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } } } @@ -1418,7 +1521,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { - bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); + memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } @@ -1498,11 +1601,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* - * If the vdev is being removed we don't activate - * the metaslabs since we want to ensure that no new - * allocations are performed on this device. + * If the vdev is marked as non-allocating then don't + * activate the metaslabs since we want to ensure that + * no allocations are performed on this device. */ - if (!expanding && !vd->vdev_removing) { + if (vd->vdev_noalloc) { + /* track non-allocating vdev space */ + spa->spa_nonallocating_dspace += spa_deflate(spa) ? 
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); @@ -1511,13 +1618,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); - /* - * Regardless whether this vdev was just added or it is being - * expanded, the metaslab count has changed. Recalculate the - * block limit. - */ - spa_log_sm_set_blocklimit(spa); - return (0); } @@ -1565,12 +1665,12 @@ vdev_metaslab_fini(vdev_t *vd) } } ASSERT0(vd->vdev_ms_count); - ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1604,6 +1704,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1614,6 +1716,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1663,8 +1776,8 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_TRYHARD; + ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1691,15 +1804,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) @@ -1767,6 +1871,7 @@ vdev_uses_zvols(vdev_t *vd) static boolean_t vdev_default_open_children_func(vdev_t *vd) { + (void) vd; return (B_TRUE); } @@ -1825,21 +1930,42 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * since expanded RAIDZ vdevs can use a different asize for different birth + * txg's. 
*/ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } /* + * Choose the best of two ashifts, preferring one between logical ashift + * (absolute minimum) and administrator defined maximum, otherwise take + * the biggest of the two. + */ +uint64_t +vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) +{ + if (a > logical && a <= zfs_vdev_max_auto_ashift) { + if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (a); + else + return (MAX(a, b)); + } else if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (MAX(a, b)); + return (b); +} + +/* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the @@ -1850,7 +1976,8 @@ vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); - if (vd->vdev_ashift < vd->vdev_physical_ashift) { + if (vd->vdev_ashift < vd->vdev_physical_ashift && + vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, @@ -1915,6 +2042,14 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); + + /* Keep the device in removed state if unplugged */ + if (error == ENOENT && vd->vdev_removed) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, + VDEV_AUX_NONE); + return (error); + } + /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy @@ -1986,8 +2121,8 @@ vdev_open(vdev_t *vd) } } - osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); - max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); + osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); + max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { @@ -2062,9 +2197,9 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EDOM)); } - if (vd->vdev_top == vd) { + if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); - } + vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { @@ -2125,8 +2260,7 @@ vdev_open(vdev_t *vd) if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } /* @@ -2371,22 +2505,36 @@ vdev_validate(vdev_t *vd) } static void -vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) +vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { - char *old, *new; - if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { - if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { - zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " - "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, - dvd->vdev_path, svd->vdev_path); - spa_strfree(dvd->vdev_path); - dvd->vdev_path = spa_strdup(svd->vdev_path); + if (svd != NULL && *dvd != NULL) { + if (strcmp(svd, *dvd) != 0) { + zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " + "from '%s' to '%s'", (u_longlong_t)guid, prefix, + *dvd, 
svd); + spa_strfree(*dvd); + *dvd = spa_strdup(svd); } - } else if (svd->vdev_path != NULL) { - dvd->vdev_path = spa_strdup(svd->vdev_path); + } else if (svd != NULL) { + *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", - (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); + (u_longlong_t)guid, *dvd); } +} + +static void +vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) +{ + char *old, *new; + + vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, + dvd->vdev_guid); + + vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, + dvd->vdev_guid); + + vdev_update_path("vdev_physpath", svd->vdev_physpath, + &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports @@ -2527,8 +2675,6 @@ vdev_close(vdev_t *vd) vd->vdev_ops->vdev_op_close(vd); - vdev_cache_purge(vd); - /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that @@ -2615,6 +2761,17 @@ vdev_reopen(vdev_t *vd) } /* + * Recheck if resilver is still needed and cancel any + * scheduled resilver if resilver is unneeded. + */ + if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && + spa->spa_async_tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; + mutex_exit(&spa->spa_async_lock); + } + + /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); @@ -2848,6 +3005,8 @@ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { + (void) dva, (void) psize; + /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); @@ -3103,32 +3262,71 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. 
non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); +} + +/* + * Iterate over all the vdevs except spare, and post kobj events + */ +void +vdev_post_kobj_evt(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_kobj_evt_post && + vd->vdev_kobj_flag == B_FALSE) { + vd->vdev_kobj_flag = B_TRUE; + vd->vdev_ops->vdev_op_kobj_evt_post(vd); + } + + for (int c = 0; c < vd->vdev_children; c++) + vdev_post_kobj_evt(vd->vdev_child[c]); +} + +/* + * Iterate over all the vdevs except spare, and clear kobj events + */ +void +vdev_clear_kobj_evt(vdev_t *vd) +{ + vd->vdev_kobj_flag = B_FALSE; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_clear_kobj_evt(vd->vdev_child[c]); } int @@ -3242,6 +3440,12 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) vdev_zap_allocation_data(vd, tx); } } + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && + spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { + if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); + vd->vdev_root_zap = vdev_create_link_zap(vd, tx); + } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); @@ -3469,6 +3673,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -3492,6 +3702,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t failfast; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), + 1, &failfast); + if (error == 0) { + vd->vdev_failfast = failfast & 1; + } else if (error == ENOENT) { + vd->vdev_failfast = vdev_prop_default_numeric( + VDEV_PROP_FAILFAST); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. 
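The failfast load just above, and the checksum/io/slow-io property loads that follow, all use the same lookup-with-default pattern: read the value from the vdev's ZAP and fall back to the property's compiled-in default when the key has never been set. A minimal sketch of that pattern, using a hypothetical helper name (the real code open-codes it or goes through vdev_prop_get_int()):

/*
 * Hypothetical helper illustrating the lookup-with-default pattern used
 * for per-vdev properties in vdev_load(); only zap_lookup(),
 * vdev_prop_to_name() and vdev_prop_default_numeric() are real names here.
 */
static int
load_vdev_prop_or_default(spa_t *spa, uint64_t zapobj, vdev_prop_t prop,
    uint64_t *valp)
{
	int error = zap_lookup(spa->spa_meta_objset, zapobj,
	    vdev_prop_to_name(prop), sizeof (uint64_t), 1, valp);
	if (error == ENOENT) {
		/* Never set on this vdev: use the built-in default. */
		*valp = vdev_prop_default_numeric(prop);
		error = 0;
	}
	return (error);
}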
*/ @@ -3506,6 +3736,51 @@ vdev_load(vdev_t *vd) } } + if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { + uint64_t zapobj; + + if (vd->vdev_top_zap != 0) + zapobj = vd->vdev_top_zap; + else + zapobj = vd->vdev_leaf_zap; + + error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, + &vd->vdev_checksum_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, + &vd->vdev_checksum_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, + &vd->vdev_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, + &vd->vdev_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, + &vd->vdev_slow_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, + &vd->vdev_slow_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + } + /* * If this is a top-level vdev, initialize its metaslabs. */ @@ -3793,10 +4068,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). + */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -3912,6 +4199,36 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) return (spa_vdev_state_exit(spa, vd, 0)); } +int +vdev_remove_wanted(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); + + /* + * If the vdev is already removed, or expanding which can trigger + * repartition add/remove events, then don't do anything. + */ + if (vd->vdev_removed || vd->vdev_expanding) + return (spa_vdev_state_exit(spa, NULL, 0)); + + /* + * Confirm the vdev has been removed, otherwise don't do anything. + */ + if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_REMOVE); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + + /* * Online the given vdev. 
* @@ -3932,9 +4249,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -3973,6 +4287,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } @@ -4002,9 +4317,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && - vd->vdev_state >= VDEV_STATE_DEGRADED)) + vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); + /* + * Asynchronously detach spare vdev if resilver or + * rebuild is not required + */ + if (vd->vdev_unspare && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && + !vdev_rebuild_active(tvd)) + spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); + } return (spa_vdev_state_exit(spa, vd, 0)); } @@ -4155,9 +4480,9 @@ vdev_clear(spa_t *spa, vdev_t *vd) vdev_clear(spa, vd->vdev_child[c]); /* - * It makes no sense to "clear" an indirect vdev. + * It makes no sense to "clear" an indirect or removed vdev. */ - if (!vdev_is_concrete(vd)) + if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* @@ -4297,6 +4622,8 @@ vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { + (void) cvd; + int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) @@ -4386,11 +4713,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { - vsx->vsx_active_queue[t] = - vd->vdev_queue.vq_class[t].vqc_active; - vsx->vsx_pend_queue[t] = avl_numnodes( - &vd->vdev_queue.vq_class[t].vqc_queued_tree); + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; + vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } @@ -4401,12 +4726,13 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { - bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { + vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* @@ -4444,15 +4770,18 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { - vs->vs_esize = P2ALIGN( + vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_physical_ashift <= ASHIFT_MAX) + vs->vs_physical_ashift = vd->vdev_physical_ashift; + else + vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, @@ -4469,6 +4798,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } + vs->vs_noalloc = MAX(vd->vdev_noalloc, + tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); @@ -4512,8 +4843,14 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; +/* Suppress ASAN false positive */ +#ifdef __SANITIZE_ADDRESS__ + vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; + vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; +#else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; +#endif zio_type_t type = zio->io_type; int flags = zio->io_flags; @@ -4597,11 +4934,11 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * TRIM ops and bytes are reported to user space as - * ZIO_TYPE_IOCTL. This is done to preserve the + * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) - vs_type = ZIO_TYPE_IOCTL; + vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' @@ -4730,6 +5067,7 @@ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { + (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; @@ -5191,7 +5529,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); @@ -5207,9 +5547,13 @@ vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; + VERIFY3U(pvd->vdev_children, >, 1); + vdev_remove_child(pvd, vd); vdev_compact_children(pvd); + ASSERT3P(pvd->vdev_child, !=, NULL); + cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); @@ -5219,7 +5563,7 @@ vdev_split(vdev_t *vd) } void -vdev_deadman(vdev_t *vd, char *tag) +vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; @@ -5231,20 +5575,20 @@ vdev_deadman(vdev_t *vd, char *tag) vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { + if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %lu active IOs", - vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + zfs_dbgmsg("slow vdev: %s has %u active IOs", + vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. 
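 * (spa_deadman_synctime() yields the threshold in nanoseconds, matching
 * the gethrtime() delta computed below; it is ultimately derived from
 * the zfs_deadman_synctime_ms tunable.)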
*/ - fio = avl_first(&vq->vq_active_tree); + fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); @@ -5375,6 +5719,23 @@ vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, } } +static char * +vdev_name(vdev_t *vd, char *buf, int buflen) +{ + if (vd->vdev_path == NULL) { + if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { + strlcpy(buf, vd->vdev_spa->spa_name, buflen); + } else if (!vd->vdev_ops->vdev_op_leaf) { + snprintf(buf, buflen, "%s-%llu", + vd->vdev_ops->vdev_op_type, + (u_longlong_t)vd->vdev_id); + } + } else { + strlcpy(buf, vd->vdev_path, buflen); + } + return (buf); +} + /* * Look at the vdev tree and determine whether any devices are currently being * replaced. @@ -5404,31 +5765,730 @@ vdev_replace_in_progress(vdev_t *vdev) return (B_FALSE); } +/* + * Add a (source=src, propname=propval) list to an nvlist. + */ +static void +vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, + uint64_t intval, zprop_source_t src) +{ + nvlist_t *propval; + + propval = fnvlist_alloc(); + fnvlist_add_uint64(propval, ZPROP_SOURCE, src); + + if (strval != NULL) + fnvlist_add_string(propval, ZPROP_VALUE, strval); + else + fnvlist_add_uint64(propval, ZPROP_VALUE, intval); + + fnvlist_add_nvlist(nvl, propname, propval); + nvlist_free(propval); +} + +static void +vdev_props_set_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *vd; + nvlist_t *nvp = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = spa->spa_meta_objset; + nvpair_t *elem = NULL; + uint64_t vdev_guid; + uint64_t objid; + nvlist_t *nvprops; + + vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); + nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); + vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); + + /* this vdev could get removed while waiting for this sync task */ + if (vd == NULL) + return; + + /* + * Set vdev property values in the vdev props mos object. 
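Both the sync task being defined here and vdev_prop_set() further down unpack the same request shape: the target vdev's guid under ZPOOL_VDEV_PROPS_SET_VDEV and the properties to apply under ZPOOL_VDEV_PROPS_SET_PROPS. A rough sketch of building such a request (illustrative only; "target_guid" is an assumed variable, and real callers arrive here via the zpool ioctl path):

/* Illustrative construction of the innvl consumed by vdev_prop_set(). */
nvlist_t *props = fnvlist_alloc();
fnvlist_add_uint64(props, vdev_prop_to_name(VDEV_PROP_FAILFAST), 1);

nvlist_t *innvl = fnvlist_alloc();
fnvlist_add_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, target_guid);
fnvlist_add_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, props);
fnvlist_free(props);
/* innvl can now be handed to vdev_prop_set(vd, innvl, outnvl). */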
+ */ + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + panic("unexpected vdev type"); + } + + mutex_enter(&spa->spa_props_lock); + + while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { + uint64_t intval; + const char *strval; + vdev_prop_t prop; + const char *propname = nvpair_name(elem); + zprop_type_t proptype; + + switch (prop = vdev_name_to_prop(propname)) { + case VDEV_PROP_USERPROP: + if (vdev_prop_user(propname)) { + strval = fnvpair_value_string(elem); + if (strlen(strval) == 0) { + /* remove the property if value == "" */ + (void) zap_remove(mos, objid, propname, + tx); + } else { + VERIFY0(zap_update(mos, objid, propname, + 1, strlen(strval) + 1, strval, tx)); + } + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: %s=%s", + (u_longlong_t)vdev_guid, nvpair_name(elem), + strval); + } + break; + default: + /* normalize the property name */ + propname = vdev_prop_to_name(prop); + proptype = vdev_prop_get_type(prop); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + ASSERT(proptype == PROP_TYPE_STRING); + strval = fnvpair_value_string(elem); + VERIFY0(zap_update(mos, objid, propname, + 1, strlen(strval) + 1, strval, tx)); + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: %s=%s", + (u_longlong_t)vdev_guid, nvpair_name(elem), + strval); + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + intval = fnvpair_value_uint64(elem); + + if (proptype == PROP_TYPE_INDEX) { + const char *unused; + VERIFY0(vdev_prop_index_to_string( + prop, intval, &unused)); + } + VERIFY0(zap_update(mos, objid, propname, + sizeof (uint64_t), 1, &intval, tx)); + spa_history_log_internal(spa, "vdev set", tx, + "vdev_guid=%llu: %s=%lld", + (u_longlong_t)vdev_guid, + nvpair_name(elem), (longlong_t)intval); + } else { + panic("invalid vdev property type %u", + nvpair_type(elem)); + } + } + + } + + mutex_exit(&spa->spa_props_lock); +} + +int +vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa = vd->vdev_spa; + nvpair_t *elem = NULL; + uint64_t vdev_guid; + nvlist_t *nvprops; + int error = 0; + + ASSERT(vd != NULL); + + /* Check that vdev has a zap we can use */ + if (vd->vdev_root_zap == 0 && + vd->vdev_top_zap == 0 && + vd->vdev_leaf_zap == 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, + &nvprops) != 0) + return (SET_ERROR(EINVAL)); + + if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) + return (SET_ERROR(EINVAL)); + + while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { + const char *propname = nvpair_name(elem); + vdev_prop_t prop = vdev_name_to_prop(propname); + uint64_t intval = 0; + const char *strval = NULL; + + if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { + error = EINVAL; + goto end; + } + + if (vdev_prop_readonly(prop)) { + error = EROFS; + goto end; + } + + /* Special Processing */ + switch (prop) { + case VDEV_PROP_PATH: + if (vd->vdev_path == NULL) { + error = EROFS; + break; + } + if (nvpair_value_string(elem, &strval) != 0) { + error = EINVAL; + break; + } + /* New path must start with /dev/ */ + if (strncmp(strval, "/dev/", 5)) { + error = EINVAL; + break; + } + error = spa_vdev_setpath(spa, vdev_guid, strval); + break; + case VDEV_PROP_ALLOCATING: + if (nvpair_value_uint64(elem, 
&intval) != 0) { + error = EINVAL; + break; + } + if (intval != vd->vdev_noalloc) + break; + if (intval == 0) + error = spa_vdev_noalloc(spa, vdev_guid); + else + error = spa_vdev_alloc(spa, vdev_guid); + break; + case VDEV_PROP_FAILFAST: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_failfast = intval & 1; + break; + case VDEV_PROP_CHECKSUM_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_checksum_n = intval; + break; + case VDEV_PROP_CHECKSUM_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_checksum_t = intval; + break; + case VDEV_PROP_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_io_n = intval; + break; + case VDEV_PROP_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_io_t = intval; + break; + case VDEV_PROP_SLOW_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_n = intval; + break; + case VDEV_PROP_SLOW_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_t = intval; + break; + default: + /* Most processing is done in vdev_props_set_sync */ + break; + } +end: + if (error != 0) { + intval = error; + vdev_prop_add_list(outnvl, propname, strval, intval, 0); + return (error); + } + } + + return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, + innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +int +vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + int err = 0; + uint64_t objid; + uint64_t vdev_guid; + nvpair_t *elem = NULL; + nvlist_t *nvprops = NULL; + uint64_t intval = 0; + char *strval = NULL; + const char *propname = NULL; + vdev_prop_t prop; + + ASSERT(vd != NULL); + ASSERT(mos != NULL); + + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); + + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + return (SET_ERROR(EINVAL)); + } + ASSERT(objid != 0); + + mutex_enter(&spa->spa_props_lock); + + if (nvprops != NULL) { + char namebuf[64] = { 0 }; + + while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { + intval = 0; + strval = NULL; + propname = nvpair_name(elem); + prop = vdev_name_to_prop(propname); + zprop_source_t src = ZPROP_SRC_DEFAULT; + uint64_t integer_size, num_integers; + + switch (prop) { + /* Special Read-only Properties */ + case VDEV_PROP_NAME: + strval = vdev_name(vd, namebuf, + sizeof (namebuf)); + if (strval == NULL) + continue; + vdev_prop_add_list(outnvl, propname, strval, 0, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_CAPACITY: + /* percent used */ + intval = (vd->vdev_stat.vs_dspace == 0) ? 
0 : + (vd->vdev_stat.vs_alloc * 100 / + vd->vdev_stat.vs_dspace); + vdev_prop_add_list(outnvl, propname, NULL, + intval, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_STATE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_state, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_GUID: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_guid, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ASIZE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_asize, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PSIZE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_psize, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ASHIFT: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_ashift, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_SIZE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_FREE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_dspace - + vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ALLOCATED: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_EXPANDSZ: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_FRAGMENTATION: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_fragmentation, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PARITY: + vdev_prop_add_list(outnvl, propname, NULL, + vdev_get_nparity(vd), ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PATH: + if (vd->vdev_path == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_path, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_DEVID: + if (vd->vdev_devid == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_devid, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PHYS_PATH: + if (vd->vdev_physpath == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_physpath, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_ENC_PATH: + if (vd->vdev_enc_sysfs_path == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_FRU: + if (vd->vdev_fru == NULL) + continue; + vdev_prop_add_list(outnvl, propname, + vd->vdev_fru, 0, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_PARENT: + if (vd->vdev_parent != NULL) { + strval = vdev_name(vd->vdev_parent, + namebuf, sizeof (namebuf)); + vdev_prop_add_list(outnvl, propname, + strval, 0, ZPROP_SRC_NONE); + } + continue; + case VDEV_PROP_CHILDREN: + if (vd->vdev_children > 0) + strval = kmem_zalloc(ZAP_MAXVALUELEN, + KM_SLEEP); + for (uint64_t i = 0; i < vd->vdev_children; + i++) { + const char *vname; + + vname = vdev_name(vd->vdev_child[i], + namebuf, sizeof (namebuf)); + if (vname == NULL) + vname = "(unknown)"; + if (strlen(strval) > 0) + strlcat(strval, ",", + ZAP_MAXVALUELEN); + strlcat(strval, vname, ZAP_MAXVALUELEN); + } + if (strval != NULL) { + vdev_prop_add_list(outnvl, propname, + strval, 0, ZPROP_SRC_NONE); + kmem_free(strval, ZAP_MAXVALUELEN); + } + continue; + case VDEV_PROP_NUMCHILDREN: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_children, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_READ_ERRORS: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_read_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_WRITE_ERRORS: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_write_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_CHECKSUM_ERRORS: + 
vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_checksum_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_INITIALIZE_ERRORS: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_initialize_errors, + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_NULL: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_READ: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_READ], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_WRITE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_FREE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_CLAIM: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_OPS_TRIM: + /* + * TRIM ops and bytes are reported to user + * space as ZIO_TYPE_FLUSH. This is done to + * preserve the vdev_stat_t structure layout + * for user space. + */ + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_NULL: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_READ: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_WRITE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_FREE: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_CLAIM: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_BYTES_TRIM: + /* + * TRIM ops and bytes are reported to user + * space as ZIO_TYPE_FLUSH. This is done to + * preserve the vdev_stat_t structure layout + * for user space. 
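+ * (ZIO_TYPE_FLUSH occupies the reporting slot previously labelled
+ * ZIO_TYPE_IOCTL, so the array layout consumed by user space does
+ * not change.)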
+ */ + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], + ZPROP_SRC_NONE); + continue; + case VDEV_PROP_REMOVING: + vdev_prop_add_list(outnvl, propname, NULL, + vd->vdev_removing, ZPROP_SRC_NONE); + continue; + case VDEV_PROP_RAIDZ_EXPANDING: + /* Only expose this for raidz */ + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_rz_expanding, + ZPROP_SRC_NONE); + } + continue; + /* Numeric Properites */ + case VDEV_PROP_ALLOCATING: + /* Leaf vdevs cannot have this property */ + if (vd->vdev_mg == NULL && + vd->vdev_top != NULL) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_FAILFAST: + src = ZPROP_SRC_LOCAL; + strval = NULL; + + err = zap_lookup(mos, objid, nvpair_name(elem), + sizeof (uint64_t), 1, &intval); + if (err == ENOENT) { + intval = vdev_prop_default_numeric( + prop); + err = 0; + } else if (err) { + break; + } + if (intval == vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + + vdev_prop_add_list(outnvl, propname, strval, + intval, src); + break; + case VDEV_PROP_CHECKSUM_N: + case VDEV_PROP_CHECKSUM_T: + case VDEV_PROP_IO_N: + case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: + err = vdev_prop_get_int(vd, prop, &intval); + if (err && err != ENOENT) + break; + + if (intval == vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + /* Text Properties */ + case VDEV_PROP_COMMENT: + /* Exists in the ZAP below */ + /* FALLTHRU */ + case VDEV_PROP_USERPROP: + /* User Properites */ + src = ZPROP_SRC_LOCAL; + + err = zap_length(mos, objid, nvpair_name(elem), + &integer_size, &num_integers); + if (err) + break; + + switch (integer_size) { + case 8: + /* User properties cannot be integers */ + err = EINVAL; + break; + case 1: + /* string property */ + strval = kmem_alloc(num_integers, + KM_SLEEP); + err = zap_lookup(mos, objid, + nvpair_name(elem), 1, + num_integers, strval); + if (err) { + kmem_free(strval, + num_integers); + break; + } + vdev_prop_add_list(outnvl, propname, + strval, 0, src); + kmem_free(strval, num_integers); + break; + } + break; + default: + err = ENOENT; + break; + } + if (err) + break; + } + } else { + /* + * Get all properties from the MOS vdev property object. 
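When the caller passes no explicit property list, the branch below walks every attribute in the vdev's ZAP with a cursor. A condensed skeleton of that walk (value decoding and the string-only filtering are omitted); zap_cursor_retrieve() returns ENOENT once the object is exhausted, which is why the final error check tolerates it:

/* Skeleton of the cursor walk used for the "all properties" case. */
zap_cursor_t zc;
zap_attribute_t za;
int err;

for (zap_cursor_init(&zc, mos, objid);
    (err = zap_cursor_retrieve(&zc, &za)) == 0;
    zap_cursor_advance(&zc)) {
	/*
	 * za.za_name, za.za_integer_length and za.za_num_integers
	 * describe the current entry; only 1-byte (string) values
	 * are surfaced as vdev properties.
	 */
}
zap_cursor_fini(&zc);	/* always release the cursor */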
+ */ + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, mos, objid); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + intval = 0; + strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + propname = za.za_name; + + switch (za.za_integer_length) { + case 8: + /* We do not allow integer user properties */ + /* This is likely an internal value */ + break; + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, objid, za.za_name, 1, + za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + vdev_prop_add_list(outnvl, propname, strval, 0, + src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); + } + + mutex_exit(&spa->spa_props_lock); + if (err && err != ENOENT) { + return (err); + } + + return (0); +} + EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW, - "Default limit for metaslab size"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, + "Default lower limit for metaslab size"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, + "Default upper limit for metaslab size"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); +ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, + "Rate limit hung IO (deadman) events to this many per second"); + +/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " - "(do not set below zed threshold)."); + "(do not set below ZED threshold)."); +/* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); @@ -5439,15 +6499,16 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); -ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); +/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, - param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, + param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, - param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, + param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git 
a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c deleted file mode 100644 index 6e82184b800d..000000000000 --- a/sys/contrib/openzfs/module/zfs/vdev_cache.c +++ /dev/null @@ -1,437 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include <sys/zfs_context.h> -#include <sys/spa.h> -#include <sys/vdev_impl.h> -#include <sys/zio.h> -#include <sys/kstat.h> -#include <sys/abd.h> - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 128 back-to-back 512-byte - * reads into a single 64k read followed by 127 cache hits; this reduces - * latency dramatically. In the worst case, it can turn an isolated 512-byte - * read into a 64k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A further enhancement could take advantage of - * more semantic information about the I/O. And it could use something - * faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. - * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software - * track buffer). At most zfs_vdev_cache_size bytes will be kept in each - * vdev's vdev_cache. - * - * TODO: Note that with the current ZFS code, it turns out that the - * vdev cache is not helpful, and in some cases actually harmful. 
It - * is better if we disable this. Once some time has passed, we should - * actually remove this to simplify the code. For now we just disable - * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 - * has made these same changes. - */ -int zfs_vdev_cache_max = 1<<14; /* 16KB */ -int zfs_vdev_cache_size = 0; -int zfs_vdev_cache_bshift = 16; - -#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ - -kstat_t *vdc_ksp = NULL; - -typedef struct vdc_stats { - kstat_named_t vdc_stat_delegations; - kstat_named_t vdc_stat_hits; - kstat_named_t vdc_stat_misses; -} vdc_stats_t; - -static vdc_stats_t vdc_stats = { - { "delegations", KSTAT_DATA_UINT64 }, - { "hits", KSTAT_DATA_UINT64 }, - { "misses", KSTAT_DATA_UINT64 } -}; - -#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64); - -static inline int -vdev_cache_offset_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); - if (likely(cmp)) - return (cmp); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. - */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - ASSERT3P(ve->ve_abd, !=, NULL); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - abd_free(ve->ve_abd); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. - */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). - */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) - return (NULL); - ASSERT3U(ve->ve_hits, !=, 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = ddi_get_lbolt(); - ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - - if (ve->ve_lastused != ddi_get_lbolt()) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = ddi_get_lbolt(); - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. 
- */ -static void -vdev_cache_fill(zio_t *fio) -{ - vdev_t *vd = fio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = fio->io_private; - zio_t *pio; - - ASSERT3U(fio->io_size, ==, VCBS); - - /* - * Add data to the cache. - */ - mutex_enter(&vc->vc_lock); - - ASSERT3P(ve->ve_fill_io, ==, fio); - ASSERT3U(ve->ve_offset, ==, fio->io_offset); - ASSERT3P(ve->ve_abd, ==, fio->io_abd); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(fio, &zl)) != NULL) - vdev_cache_hit(vc, ve, pio); - - if (fio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); -} - -/* - * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. - */ -boolean_t -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, *ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - zio_t *fio; - uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (B_FALSE); - - if (zio->io_size > zfs_vdev_cache_max) - return (B_FALSE); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. - */ - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (B_FALSE); - - ASSERT3U(cache_phase + zio->io_size, <=, VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve_search->ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); - kmem_free(ve_search, sizeof (vdev_cache_entry_t)); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_delegations); - return (B_TRUE); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_hits); - return (B_TRUE); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - VDCSTAT_BUMP(vdc_stat_misses); - - return (B_TRUE); -} - -/* - * Update cache contents upon write completion. 
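For orientation while reading this removed file: each cache entry covered one VCBS-sized, VCBS-aligned region; with the default zfs_vdev_cache_bshift of 16 that is 64 KiB, i.e. exactly the 128 back-to-back 512-byte reads cited in the header comment (128 * 512 B = 65536 B). A sketch of the offset math the deleted routines performed, where io_offset/io_start/io_end stand in for a zio's offset and range:

/* VCBS == 1 << zfs_vdev_cache_bshift (64 KiB by default). */
uint64_t line_start = P2ALIGN(io_offset, VCBS);	/* cache-line origin */
uint64_t line_phase = P2PHASE(io_offset, VCBS);	/* offset within the line */
/* A write covering [io_start, io_end) may touch every line between: */
uint64_t first_line = P2ALIGN(io_start, VCBS);
uint64_t limit = P2ROUNDUP(io_end, VCBS);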
- */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - abd_copy_off(ve->ve_abd, zio->io_abd, - start - ve->ve_offset, start - io_start, - end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_purge(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - vdev_cache_purge(vd); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} - -void -vdev_cache_stat_init(void) -{ - vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (vdc_ksp != NULL) { - vdc_ksp->ks_data = &vdc_stats; - kstat_install(vdc_ksp); - } -} - -void -vdev_cache_stat_fini(void) -{ - if (vdc_ksp != NULL) { - kstat_delete(vdc_ksp); - vdc_ksp = NULL; - } -} - -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, INT, ZMOD_RW, - "Inflate reads small than max"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, INT, ZMOD_RD, - "Total size of the per-disk cache"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, INT, ZMOD_RW, - "Shift size to inflate reads too"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index b8f82d52e8f0..13bb33cc6871 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -541,7 +541,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { - for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); @@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). */ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize) +vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; @@ -842,6 +843,53 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) } /* + * Verify that all empty sectors are zero filled before using them to + * calculate parity. Otherwise, silent corruption in an empty sector will + * result in bad parity being generated. That bad parity will then be + * considered authoritative and overwrite the good parity on disk. This + * is possible because the checksum is only calculated over the data, + * thus it cannot be used to detect damage in empty sectors. + */ +int +vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t skip_off = parity_size - skip_size; + uint64_t empty_off = 0; + int ret = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, !=, NULL); + ASSERT3U(rr->rr_bigcols, >, 0); + + void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP); + + for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + ASSERT3P(rc->rc_abd, !=, NULL); + ASSERT3U(rc->rc_size, ==, parity_size); + + if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off, + skip_size) != 0) { + vdev_raidz_checksum_error(zio, rc, rc->rc_abd); + abd_zero_off(rc->rc_abd, skip_off, skip_size); + rc->rc_error = SET_ERROR(ECKSUM); + ret++; + } + + empty_off += skip_size; + } + + ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty)); + + kmem_free(zero_buf, skip_size); + + return (ret); +} + +/* * Given a logical address within a dRAID configuration, return the physical * address on the first drive in the group that this address maps to * (at position 'start' in permutation number 'perm'). @@ -913,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -976,15 +1024,11 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, /* The total number of data and parity sectors for this I/O. */ uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 
0 : 1))); - raidz_row_t *rr; - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); - rr->rr_cols = groupwidth; - rr->rr_scols = groupwidth; + ASSERT3U(vdc->vdc_nparity, >, 0); + + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; @@ -1004,14 +1048,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (q == 0 && i >= bc) rc->rc_size = 0; @@ -1080,7 +1116,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size); + io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1102,7 +1138,6 @@ vdev_draid_map_alloc(zio_t *zio) rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; - return (rm); } @@ -1449,8 +1484,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); - physical_ashift = MAX(physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + physical_ashift = vdev_best_ashift(logical_ashift, + physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; @@ -1678,7 +1719,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, uint64_t nparity = vdc->vdc_nparity; for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { - bzero(path, sizeof (path)); + memset(path, 0, sizeof (path)); (void) snprintf(path, sizeof (path) - 1, "%s%llu-%llu-%llu", VDEV_TYPE_DRAID, (u_longlong_t)nparity, @@ -1707,7 +1748,7 @@ vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, if (n > 0) { (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - new_spares, n); + (const nvlist_t **)new_spares, n); } for (int i = 0; i < n; i++) @@ -1728,7 +1769,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize); + uint64_t asize = vdev_draid_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1785,7 +1826,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size); + vdev_draid_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; @@ -2154,6 +2195,7 @@ vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) static int vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) { + (void) spa; uint64_t ndata, nparity, nspares, ngroups; int error; @@ -2382,7 +2424,6 @@ vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) return (cvd); } -/* ARGSUSED */ static void 
vdev_draid_spare_close(vdev_t *vd) { @@ -2507,24 +2548,20 @@ vdev_draid_read_config_spare(vdev_t *vd) } /* - * Handle any ioctl requested of the distributed spare. Only flushes - * are supported in which case all children must be flushed. + * Handle any flush requested of the distributed spare. All children must be + * flushed. */ static int -vdev_draid_spare_ioctl(zio_t *zio) +vdev_draid_spare_flush(zio_t *zio) { vdev_t *vd = zio->io_vd; int error = 0; - if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { - for (int c = 0; c < vd->vdev_children; c++) { - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[c], zio->io_offset, zio->io_abd, - zio->io_size, zio->io_type, zio->io_priority, 0, - vdev_draid_spare_child_done, zio)); - } - } else { - error = SET_ERROR(ENOTSUP); + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); } return (error); @@ -2555,8 +2592,8 @@ vdev_draid_spare_io_start(zio_t *zio) } switch (zio->io_type) { - case ZIO_TYPE_IOCTL: - zio->io_error = vdev_draid_spare_ioctl(zio); + case ZIO_TYPE_FLUSH: + zio->io_error = vdev_draid_spare_flush(zio); break; case ZIO_TYPE_WRITE: @@ -2641,10 +2678,10 @@ vdev_draid_spare_io_start(zio_t *zio) zio_execute(zio); } -/* ARGSUSED */ static void vdev_draid_spare_io_done(zio_t *zio) { + (void) zio; } /* @@ -2665,7 +2702,7 @@ vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, return (SET_ERROR(ENOENT)); } - char *spare_name; + const char *spare_name; error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name); if (error != 0) return (SET_ERROR(EINVAL)); @@ -2673,7 +2710,7 @@ vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, for (int i = 0; i < nspares; i++) { nvlist_t *spare = spares[i]; uint64_t top_guid, spare_id; - char *type, *path; + const char *type, *path; /* Skip non-distributed spares */ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 14ebf5514676..acb725696674 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -48,8 +48,8 @@ * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * - * - i/os to this vdev use the callback to determine where the - * data is now located, and issue child i/os for each segment's new + * - I/Os to this vdev use the callback to determine where the + * data is now located, and issue child I/Os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim @@ -172,7 +172,7 @@ * object. */ -int zfs_condense_indirect_vdevs_enable = B_TRUE; +static int zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is @@ -181,7 +181,7 @@ int zfs_condense_indirect_vdevs_enable = B_TRUE; * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. 
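Taken together with zfs_condense_max_obsolete_bytes and zfs_condense_min_mapping_bytes below, this tunable gates when a condense is worthwhile; with the defaults, a mapping describing 4 GiB of remapped data becomes a candidate once roughly 1 GiB (25%) of it is obsolete, provided the in-core mapping itself is at least 128 KiB. A simplified sketch of how the three values interact (illustration only; the file's real predicate also honors the zfs_condense_indirect_vdevs_enable switch and other pool state):

/*
 * Simplified view of the condense trigger implied by these tunables;
 * the actual check in this file applies further conditions.
 */
static boolean_t
condense_candidate_sketch(uint64_t bytes_mapped, uint64_t bytes_obsolete,
    uint64_t obsolete_sm_bytes, uint64_t mapping_in_core_bytes)
{
	if (mapping_in_core_bytes < zfs_condense_min_mapping_bytes)
		return (B_FALSE);	/* mapping too small to bother */
	if (obsolete_sm_bytes > zfs_condense_max_obsolete_bytes)
		return (B_TRUE);	/* obsolete space map itself too big */
	return ((bytes_obsolete * 100 >=
	    bytes_mapped * zfs_condense_indirect_obsolete_pct) ?
	    B_TRUE : B_FALSE);
}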
*/ -int zfs_condense_indirect_obsolete_pct = 25; +static uint_t zfs_condense_indirect_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of @@ -189,14 +189,14 @@ int zfs_condense_indirect_obsolete_pct = 25; * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ -unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; +static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ -unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; +static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain @@ -204,7 +204,7 @@ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ -int zfs_condense_indirect_commit_entry_delay_ms = 0; +static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0; /* * If an indirect split block contains more than this many possible unique @@ -214,7 +214,7 @@ int zfs_condense_indirect_commit_entry_delay_ms = 0; * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ -int zfs_reconstruct_indirect_combinations_max = 4096; +uint_t zfs_reconstruct_indirect_combinations_max = 4096; /* * Enable to simulate damaged segments and validate reconstruction. This @@ -270,7 +270,7 @@ typedef struct indirect_split { */ indirect_child_t *is_good_child; - indirect_child_t is_child[1]; /* variable-length */ + indirect_child_t is_child[]; } indirect_split_t; /* @@ -293,17 +293,16 @@ vdev_indirect_map_free(zio_t *zio) indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; - while ((is = list_head(&iv->iv_splits)) != NULL) { + while ((is = list_remove_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } - list_remove(&iv->iv_splits, is); indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; list_destroy(&is->is_unique_child); @@ -637,16 +636,15 @@ spa_condense_indirect_generate_new_mapping(vdev_t *vd, } } -/* ARGSUSED */ static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { + (void) zthr; spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } -/* ARGSUSED */ static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { @@ -941,13 +939,12 @@ vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) return (error); } -/* ARGSUSED */ static void vdev_indirect_close(vdev_t *vd) { + (void) vd; } -/* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) @@ -1023,7 +1020,7 @@ vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); - bcopy(first_mapping, duplicate_mappings, copy_length); + memcpy(duplicate_mappings, first_mapping, copy_length); *copied_entries = entries; return 
(duplicate_mappings); @@ -1321,6 +1318,7 @@ vdev_indirect_io_start(zio_t *zio) vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); + ASSERT3P(first, !=, NULL); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire @@ -1371,9 +1369,10 @@ vdev_indirect_io_start(zio_t *zio) is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, - abd_get_offset(zio->io_abd, - is->is_split_offset), is->is_size, - zio->io_type, zio->io_priority, 0, + abd_get_offset_size(zio->io_abd, + is->is_split_offset, is->is_size), + is->is_size, zio->io_type, + zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } @@ -1399,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio, vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - zio_bad_cksum_t zbc = {{{ 0 }}}; + zio_bad_cksum_t zbc = { 0 }; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, @@ -1480,12 +1479,12 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vdev_t *vd = ic->ic_vdev; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, - NULL, zio, is->is_target_offset, is->is_size, - NULL, NULL, NULL); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + (void) zfs_ereport_post_checksum(zio->io_spa, vd, + NULL, zio, is->is_target_offset, is->is_size, + NULL, NULL, NULL); } } } @@ -1659,8 +1658,8 @@ out: for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; is->is_unique_children = 0; } @@ -1885,23 +1884,28 @@ EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, - "Whether to attempt condensing indirect vdev mappings"); +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, + ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, INT, ZMOD_RW, - "Minimum obsolete percent of bytes in the mapping to attempt condensing"); +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT, + ZMOD_RW, + "Minimum obsolete percent of bytes in the mapping " + "to attempt condensing"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64, + ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, + UINT, ZMOD_RW, "Used by tests to ensure certain actions happen in the middle of a " "condense. 
A maximum value of 1 should be sufficient."); -ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, + UINT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c index 99b83c392257..65a57e73604f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c @@ -38,6 +38,8 @@ vdev_indirect_births_verify(vdev_indirect_births_t *vib) return (B_TRUE); } +#else +#define vdev_indirect_births_verify(vib) ((void) sizeof (vib), B_TRUE) #endif uint64_t @@ -150,7 +152,7 @@ vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, new_entries = vmem_alloc(new_size, KM_SLEEP); if (old_size > 0) { - bcopy(vib->vib_entries, new_entries, old_size); + memcpy(new_entries, vib->vib_entries, old_size); vmem_free(vib->vib_entries, old_size); } new_entries[vib->vib_phys->vib_count - 1] = vibe; diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c index bb484a401b1b..e92495f2dd34 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c @@ -54,6 +54,8 @@ vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) return (B_TRUE); } +#else +#define vdev_indirect_mapping_verify(vim) ((void) sizeof (vim), B_TRUE) #endif uint64_t @@ -480,7 +482,7 @@ vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); vim->vim_entries = vmem_alloc(new_size, KM_SLEEP); if (old_size > 0) { - bcopy(old_entries, vim->vim_entries, old_size); + memcpy(vim->vim_entries, old_entries, old_size); vmem_free(old_entries, old_size); } VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, @@ -582,7 +584,7 @@ vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim) 0, counts_size, counts, DMU_READ_PREFETCH)); } else { - bzero(counts, counts_size); + memset(counts, 0, counts_size); } return (counts); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index e9156c32f384..0a7323f58df2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. */ #include <sys/spa.h> @@ -36,23 +36,20 @@ /* * Value that is written to disk during initialization. 
*/ -#ifdef _ILP32 -unsigned long zfs_initialize_value = 0xdeadbeefUL; -#else -unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL; -#endif +static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL; /* maximum number of I/Os outstanding per leaf vdev */ -int zfs_initialize_limit = 1; +static const int zfs_initialize_limit = 1; /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ -unsigned long zfs_initialize_chunk_size = 1024 * 1024; +static uint64_t zfs_initialize_chunk_size = 1024 * 1024; static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } static void @@ -71,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; @@ -101,6 +99,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) } static void +vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx) +{ + uint64_t guid = *(uint64_t *)arg; + + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE); + ASSERT3U(vd->vdev_leaf_zap, !=, 0); + + vd->vdev_initialize_last_offset = 0; + vd->vdev_initialize_action_time = 0; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + int error; + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx); + VERIFY(error == 0 || error == ENOENT); +} + +static void vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); @@ -127,8 +158,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, tx); + + if (new_state != VDEV_INITIALIZE_NONE) { + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, tx); + } else { + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_remove_sync, guid, tx); + } switch (new_state) { case VDEV_INITIALIZE_ACTIVE: @@ -149,6 +186,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) spa_history_log_internal(spa, "initialize", tx, "vdev=%s complete", vd->vdev_path); break; + case VDEV_INITIALIZE_NONE: + spa_history_log_internal(spa, "uninitialize", tx, + "vdev=%s", vd->vdev_path); + break; default: panic("invalid state %llu", (unsigned long long)new_state); } @@ -255,20 +296,15 @@ vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. 
The ABD * allocation will guarantee these for us. */ -/* ARGSUSED */ static int vdev_initialize_block_fill(void *buf, size_t len, void *unused) { + (void) unused; + ASSERT0(len % sizeof (uint64_t)); -#ifdef _ILP32 - for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) { - *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value; - } -#else for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; } -#endif return (0); } @@ -487,7 +523,7 @@ vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); } -static void +static __attribute__((noreturn)) void vdev_initialize_thread(void *arg) { vdev_t *vd = arg; @@ -597,6 +633,7 @@ vdev_initialize(vdev_t *vd) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, @@ -604,6 +641,24 @@ vdev_initialize(vdev_t *vd) } /* + * Uninitializes a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_uninitialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE); +} + +/* * Wait for the initialize thread to be terminated (cancelled or stopped). */ static void @@ -624,9 +679,11 @@ vdev_initialize_stop_wait_impl(vdev_t *vd) void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) { + (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); @@ -668,7 +725,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -700,7 +758,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) spa_t *spa = vd->vdev_spa; list_t vd_list; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); @@ -719,7 +778,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) void vdev_initialize_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -738,13 +798,14 @@ vdev_initialize_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = timestamp; - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ 
VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } @@ -758,15 +819,14 @@ vdev_initialize_restart(vdev_t *vd) } EXPORT_SYMBOL(vdev_initialize); +EXPORT_SYMBOL(vdev_uninitialize); EXPORT_SYMBOL(vdev_initialize_stop); EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); EXPORT_SYMBOL(vdev_initialize_restart); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW, "Value written during zpool initialize"); -ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW, "Size in bytes of writes by zpool initialize"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index daf53f0a0c8b..ed592514fded 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -142,6 +142,7 @@ #include <sys/zap.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> @@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } + + pool_raidz_expand_stat_t pres; + if (spa_raidz_expand_get_stats(spa, &pres) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, + sizeof (pres) / sizeof (uint64_t)); + } } static void @@ -486,6 +494,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + if (flags & VDEV_CONFIG_L2CACHE) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, @@ -496,7 +507,16 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, vd->vdev_asize); fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); - if (vd->vdev_removing) { + if (vd->vdev_noalloc) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, + vd->vdev_noalloc); + } + + /* + * Slog devices are removed synchronously so don't + * persist the vdev_removing flag to the label. 
+ */ + if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } @@ -564,6 +584,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_top_zap); } + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 && + spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, + vd->vdev_root_zap); + } + if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); @@ -640,35 +666,22 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c, idx; + uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0, idx = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - /* - * If we're generating an nvlist of removing - * vdevs then skip over any device which is - * not being removed. - */ - if ((flags & VDEV_CONFIG_REMOVING) && - !cvd->vdev_removing) - continue; - - child[idx++] = vdev_config_generate(spa, cvd, + for (c = 0; c < vd->vdev_children; c++) { + child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } - if (idx) { - fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, idx); - } + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + (const nvlist_t * const *)child, vd->vdev_children); - for (c = 0; c < idx; c++) + for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); @@ -1018,6 +1031,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isspare)); + boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isl2cache)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -1103,36 +1120,58 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). */ - if (reason == VDEV_LABEL_SPARE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + if (reason_spare || reason_l2cache) { /* - * For inactive hot spares, we generate a special label that - * identifies as a mutually shared hot spare. We write the - * label if we are adding a hot spare, or if we are removing an - * active hot spare (in which case we want to revert the - * labels). + * For inactive hot spares and level 2 ARC devices, we generate + * a special label that identifies as a mutually shared hot + * spare or l2cache device. We write the label in case of + * addition or removal of hot spare or l2cache vdev (in which + * case we want to revert the labels). */ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_SPARE) == 0); + reason_spare ? POOL_STATE_SPARE : POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); - } else if (reason == VDEV_LABEL_L2CACHE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* - * For level 2 ARC devices, add a special label. 
+ * This is merely to facilitate reporting the ashift of the + * cache device through zdb. The actual retrieval of the + * ashift (in vdev_alloc()) uses the nvlist + * spa->spa_l2cache->sav_config (populated in + * spa_ld_open_aux_vdevs()). */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (reason_l2cache) { + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); + } - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_L2CACHE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); + /* + * Add path information to help find it during pool import + */ + if (vd->vdev_path != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PATH, + vd->vdev_path) == 0); + } + if (vd->vdev_devid != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_DEVID, + vd->vdev_devid) == 0); + } + if (vd->vdev_physpath != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + } + + /* + * When spare or l2cache (aux) vdev is added during pool + * creation, spa->spa_uberblock is not written until this + * point. Write it on next config sync. + */ + if (uberblock_verify(&spa->spa_uberblock)) + spa->spa_aux_sync_uber = B_TRUE; } else { uint64_t txg = 0ULL; @@ -1164,8 +1203,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Initialize uberblock template. */ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_RING); abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_RING - sizeof (uberblock_t)); ub = abd_to_buf(ub_abd); ub->ub_txg = 0; @@ -1320,7 +1360,7 @@ vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) nvlist_free(config); break; } - fallthrough; + zfs_fallthrough; default: /* Check for FreeBSD zfs bootonce command string */ buf = abd_to_buf(abd); @@ -1355,6 +1395,7 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) int error; size_t nvsize; char *nvbuf; + const char *tmp; error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); if (error != 0) @@ -1394,8 +1435,8 @@ vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); switch (bootenv->vbe_version) { case VB_RAW: - if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) { - (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize); + if (nvlist_lookup_string(env, GRUB_ENVMAP, &tmp) == 0) { + (void) strlcpy(bootenv->vbe_bootenv, tmp, nvsize); } error = 0; break; @@ -1488,7 +1529,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) } struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ + uberblock_t ubl_latest; /* Most recent uberblock */ + uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; @@ -1505,6 +1547,9 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { + cbp->ubl_latest = *ub; + } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* @@ -1561,11 +1606,11 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) ASSERT(ub); ASSERT(config); - bzero(ub, sizeof (uberblock_t)); + memset(ub, 0, sizeof (uberblock_t)); + memset(&cb, 0, sizeof (cb)); *config = 
NULL; cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); @@ -1582,6 +1627,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. " "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != + cb.ubl_latest.ub_raidz_reflow_info) { + vdev_dbgmsg(cb.ubl_vd, + "spa=%s best uberblock (txg=%llu info=0x%llx) " + "has different raidz_reflow_info than latest " + "uberblock (txg=%llu info=0x%llx)", + spa->spa_name, + (u_longlong_t)ub->ub_txg, + (u_longlong_t)ub->ub_raidz_reflow_info, + (u_longlong_t)cb.ubl_latest.ub_txg, + (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); + memset(ub, 0, sizeof (uberblock_t)); + spa_config_exit(spa, SCL_ALL, FTAG); + return; + } + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " @@ -1703,13 +1764,29 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, vd->vdev_copy_uberblocks = B_FALSE; } + /* + * We chose a slot based on the txg. If this uberblock has a special + * RAIDZ expansion state, then it is essentially an update of the + * current uberblock (it has the same txg). However, the current + * state is committed, so we want to write it to a different slot. If + * we overwrote the same slot, and we lose power during the uberblock + * write, and the disk does not do single-sector overwrites + * atomically (even though it is required to - i.e. we should see + * either the old or the new uberblock), then we could lose this + * txg's uberblock. Rewinding to the previous txg's uberblock may not + * be possible because RAIDZ expansion may have already overwritten + * some of the data, so we need the progress indicator in the + * uberblock. + */ int m = spa_multihost(vd->vdev_spa) ? 
MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) % + (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); - abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + abd_zero_off(ub_abd, sizeof (uberblock_t), + VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); for (int l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ub_abd, @@ -1721,7 +1798,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -static int +int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; @@ -1733,6 +1810,16 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); + if (spa->spa_aux_sync_uber) { + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_spares.sav_vdevs[v], flags); + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_l2cache.sav_vdevs[v], flags); + } + } (void) zio_wait(zio); /* @@ -1747,6 +1834,19 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) zio_flush(zio, svd[v]); } } + if (spa->spa_aux_sync_uber) { + spa->spa_aux_sync_uber = B_FALSE; + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) { + zio_flush(zio, spa->spa_spares.sav_vdevs[v]); + } + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) { + zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]); + } + } + } (void) zio_wait(zio); @@ -1927,6 +2027,7 @@ retry: /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. */ @@ -1934,7 +2035,8 @@ retry: boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c index 5eb331046953..102eacb03349 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c +++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
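The slot computation just above biases the uberblock ring index by one when the RAIDZ-expansion scratch state is valid, so the reflow update of a txg lands in a different slot than the committed copy of the same txg. A small sketch of that arithmetic follows; UB_COUNT and uberblock_slot() are stand-ins chosen here (VDEV_UBERBLOCK_COUNT(vd) depends on the vdev), and scratch_valid stands in for RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UB_COUNT		32	/* stand-in for VDEV_UBERBLOCK_COUNT(vd) */
#define MMP_BLOCKS_PER_LABEL	1

/*
 * Pick the uberblock ring slot for a txg.  When the RAIDZ expansion
 * scratch state is valid, the txg is biased by one so the rewrite of the
 * same txg does not overwrite the committed uberblock's slot.
 */
static int
uberblock_slot(uint64_t txg, bool multihost, bool scratch_valid)
{
	int m = multihost ? MMP_BLOCKS_PER_LABEL : 0;

	return ((int)((txg - (scratch_valid ? 1 : 0)) % (UB_COUNT - m)));
}

int
main(void)
{
	uint64_t txg = 1000;

	/* The committed uberblock and its reflow update go to adjacent slots. */
	printf("committed slot %d\n", uberblock_slot(txg, false, false));
	printf("reflow slot    %d\n", uberblock_slot(txg, false, true));
	return (0);
}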
* @@ -35,6 +35,7 @@ #include <sys/vdev_impl.h> #include <sys/vdev_draid.h> #include <sys/zio.h> +#include <sys/zio_checksum.h> #include <sys/abd.h> #include <sys/fs/zfs.h> @@ -102,6 +103,7 @@ vdev_mirror_stat_fini(void) */ typedef struct mirror_child { vdev_t *mc_vd; + abd_t *mc_abd; uint64_t mc_offset; int mc_error; int mc_load; @@ -121,7 +123,7 @@ typedef struct mirror_map { mirror_child_t mm_child[]; } mirror_map_t; -static int vdev_mirror_shift = 21; +static const int vdev_mirror_shift = 21; /* * The load configuration settings below are tuned by default for @@ -407,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { @@ -439,32 +447,6 @@ vdev_mirror_child_done(zio_t *zio) mc->mc_skipped = 0; } -static void -vdev_mirror_scrub_done(zio_t *zio) -{ - mirror_child_t *mc = zio->io_private; - - if (zio->io_error == 0) { - zio_t *pio; - zio_link_t *zl = NULL; - - mutex_enter(&zio->io_lock); - while ((pio = zio_walk_parents(zio, &zl)) != NULL) { - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - abd_copy(pio->io_abd, zio->io_abd, pio->io_size); - mutex_exit(&pio->io_lock); - } - mutex_exit(&zio->io_lock); - } - - abd_free(zio->io_abd); - - mc->mc_error = zio->io_error; - mc->mc_tried = 1; - mc->mc_skipped = 0; -} - /* * Check the other, lower-index DVAs to see if they're on the same * vdev as the child we picked. If they are, use them since they @@ -549,7 +531,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int c, lowest_load; - ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); + ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; @@ -637,16 +619,15 @@ vdev_mirror_io_start(zio_t *zio) } if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_bp != NULL && - (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { /* - * For scrubbing reads (if we can verify the - * checksum here, as indicated by io_bp being - * non-NULL) we need to allocate a read buffer for - * each child and issue reads to all children. If - * any child succeeds, it will copy its data into - * zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads we need to issue reads to all + * children. One child can reuse parent buffer, but + * for others we have to allocate separate ones to + * verify checksums if io_bp is non-NULL, or compare + * them in vdev_mirror_io_done() otherwise. */ + boolean_t first = B_TRUE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -658,12 +639,15 @@ vdev_mirror_io_start(zio_t *zio) continue; } - zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, + mc->mc_abd = first ? 
zio->io_abd : abd_alloc_sametype(zio->io_abd, - zio->io_size), zio->io_size, - zio->io_type, zio->io_priority, 0, - vdev_mirror_scrub_done, mc)); + zio->io_size); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, mc->mc_abd, + zio->io_size, zio->io_type, + zio->io_priority, 0, + vdev_mirror_child_done, mc)); + first = B_FALSE; } zio_execute(zio); return; @@ -731,6 +715,7 @@ vdev_mirror_io_done(zio_t *zio) int c; int good_copies = 0; int unexpected_errors = 0; + int last_good_copy = -1; if (mm == NULL) return; @@ -742,6 +727,7 @@ vdev_mirror_io_done(zio_t *zio) if (!mc->mc_skipped) unexpected_errors++; } else if (mc->mc_tried) { + last_good_copy = c; good_copies++; } } @@ -755,7 +741,6 @@ vdev_mirror_io_done(zio_t *zio) * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ - /* XXPOLICY */ if (good_copies != mm->mm_children) { /* * Always require at least one good copy. @@ -782,7 +767,6 @@ vdev_mirror_io_done(zio_t *zio) /* * If we don't have a good copy yet, keep trying other children. */ - /* XXPOLICY */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; @@ -794,7 +778,80 @@ vdev_mirror_io_done(zio_t *zio) return; } - /* XXPOLICY */ + if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) { + abd_t *best_abd = NULL; + if (last_good_copy >= 0) + best_abd = mm->mm_child[last_good_copy].mc_abd; + + /* + * If we're scrubbing but don't have a BP available (because + * this vdev is under a raidz or draid vdev) then the best we + * can do is compare all of the copies read. If they're not + * identical then return a checksum error and the most likely + * correct data. The raidz code will issue a repair I/O if + * possible. + */ + if (zio->io_bp == NULL) { + ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops || + zio->io_vd->vdev_ops == &vdev_spare_ops); + + abd_t *pref_abd = NULL; + for (c = 0; c < last_good_copy; c++) { + mc = &mm->mm_child[c]; + if (mc->mc_error || !mc->mc_tried) + continue; + + if (abd_cmp(mc->mc_abd, best_abd) != 0) + zio->io_error = SET_ERROR(ECKSUM); + + /* + * The distributed spare is always prefered + * by vdev_mirror_child_select() so it's + * considered to be the best candidate. + */ + if (pref_abd == NULL && + mc->mc_vd->vdev_ops == + &vdev_draid_spare_ops) + pref_abd = mc->mc_abd; + + /* + * In the absence of a preferred copy, use + * the parent pointer to avoid a memory copy. + */ + if (mc->mc_abd == zio->io_abd) + best_abd = mc->mc_abd; + } + if (pref_abd) + best_abd = pref_abd; + } else { + + /* + * If we have a BP available, then checksums are + * already verified and we just need a buffer + * with valid data, preferring parent one to + * avoid a memory copy. 
+ */ + for (c = 0; c < last_good_copy; c++) { + mc = &mm->mm_child[c]; + if (mc->mc_error || !mc->mc_tried) + continue; + if (mc->mc_abd == zio->io_abd) { + best_abd = mc->mc_abd; + break; + } + } + } + + if (best_abd && best_abd != zio->io_abd) + abd_copy(zio->io_abd, best_abd, zio->io_size); + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + if (mc->mc_abd != zio->io_abd) + abd_free(mc->mc_abd); + mc->mc_abd = NULL; + } + } + if (good_copies == 0) { zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); @@ -880,6 +937,8 @@ static uint64_t vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, uint64_t max_segment) { + (void) start; + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); @@ -961,20 +1020,21 @@ vdev_ops_t vdev_spare_ops = { .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW, - "Rotating media load increment for non-seeking I/O's"); + "Rotating media load increment for non-seeking I/Os"); -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW, - "Rotating media load increment for seeking I/O's"); +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, + ZMOD_RW, "Rotating media load increment for seeking I/Os"); -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, + ZMOD_RW, "Offset in bytes from the last I/O which triggers " "a reduced rotating media seek increment"); +/* END CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW, - "Non-rotating media load increment for non-seeking I/O's"); +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, + ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os"); -ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW, - "Non-rotating media load increment for seeking I/O's"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, + ZMOD_RW, "Non-rotating media load increment for seeking I/Os"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c index e9145fd012d7..d3580882c3e0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_missing.c +++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -42,7 +42,6 @@ #include <sys/fs/zfs.h> #include <sys/zio.h> -/* ARGSUSED */ static int vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift, uint64_t *pshift) @@ -53,6 +52,7 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we * will fail the GUID sum check before ever trying to open the pool. 
*/ + (void) vd; *psize = 0; *max_psize = 0; *ashift = 0; @@ -60,13 +60,12 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, return (0); } -/* ARGSUSED */ static void vdev_missing_close(vdev_t *vd) { + (void) vd; } -/* ARGSUSED */ static void vdev_missing_io_start(zio_t *zio) { @@ -74,10 +73,10 @@ vdev_missing_io_start(zio_t *zio) zio_execute(zio); } -/* ARGSUSED */ static void vdev_missing_io_done(zio_t *zio) { + (void) zio; } vdev_ops_t vdev_missing_ops = { diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index cc5b15b8c028..092b3f375be0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -121,7 +121,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -uint32_t zfs_vdev_max_active = 1000; +uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the @@ -141,24 +141,24 @@ uint32_t zfs_vdev_max_active = 1000; * more quickly, but reads and writes to have higher latency and lower * throughput. */ -uint32_t zfs_vdev_sync_read_min_active = 10; -uint32_t zfs_vdev_sync_read_max_active = 10; -uint32_t zfs_vdev_sync_write_min_active = 10; -uint32_t zfs_vdev_sync_write_max_active = 10; -uint32_t zfs_vdev_async_read_min_active = 1; -uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 2; -uint32_t zfs_vdev_async_write_max_active = 10; -uint32_t zfs_vdev_scrub_min_active = 1; -uint32_t zfs_vdev_scrub_max_active = 3; -uint32_t zfs_vdev_removal_min_active = 1; -uint32_t zfs_vdev_removal_max_active = 2; -uint32_t zfs_vdev_initializing_min_active = 1; -uint32_t zfs_vdev_initializing_max_active = 1; -uint32_t zfs_vdev_trim_min_active = 1; -uint32_t zfs_vdev_trim_max_active = 2; -uint32_t zfs_vdev_rebuild_min_active = 1; -uint32_t zfs_vdev_rebuild_max_active = 3; +static uint_t zfs_vdev_sync_read_min_active = 10; +static uint_t zfs_vdev_sync_read_max_active = 10; +static uint_t zfs_vdev_sync_write_min_active = 10; +static uint_t zfs_vdev_sync_write_max_active = 10; +static uint_t zfs_vdev_async_read_min_active = 1; +/* */ uint_t zfs_vdev_async_read_max_active = 3; +static uint_t zfs_vdev_async_write_min_active = 2; +/* */ uint_t zfs_vdev_async_write_max_active = 10; +static uint_t zfs_vdev_scrub_min_active = 1; +static uint_t zfs_vdev_scrub_max_active = 3; +static uint_t zfs_vdev_removal_min_active = 1; +static uint_t zfs_vdev_removal_max_active = 2; +static uint_t zfs_vdev_initializing_min_active = 1; +static uint_t zfs_vdev_initializing_max_active = 1; +static uint_t zfs_vdev_trim_min_active = 1; +static uint_t zfs_vdev_trim_max_active = 2; +static uint_t zfs_vdev_rebuild_min_active = 1; +static uint_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -167,8 +167,8 @@ uint32_t zfs_vdev_rebuild_max_active = 3; * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. 
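The comment above describes how vdev_queue_max_async_writes() scales the active async write count with dirty data: below the min dirty percent use min_active, above the max dirty percent use max_active, and interpolate linearly in between. A userland sketch of just that interpolation follows; the function name and parameters here are illustrative, and any additional special casing in the kernel function is omitted.

#include <stdint.h>
#include <stdio.h>

static unsigned int min_active = 2;      /* zfs_vdev_async_write_min_active */
static unsigned int max_active = 10;     /* zfs_vdev_async_write_max_active */
static unsigned int min_dirty_pct = 30;  /* ..._active_min_dirty_percent */
static unsigned int max_dirty_pct = 60;  /* ..._active_max_dirty_percent */

/*
 * Clamp to min/max outside the dirty-percent window, interpolate linearly
 * inside it.
 */
static unsigned int
async_write_max_active(uint64_t dirty, uint64_t dirty_data_max)
{
	uint64_t min_bytes = dirty_data_max * min_dirty_pct / 100;
	uint64_t max_bytes = dirty_data_max * max_dirty_pct / 100;

	if (dirty < min_bytes)
		return (min_active);
	if (dirty > max_bytes)
		return (max_active);
	return ((unsigned int)((dirty - min_bytes) * (max_active - min_active) /
	    (max_bytes - min_bytes)) + min_active);
}

int
main(void)
{
	uint64_t dirty_data_max = 4ULL << 30;	/* pretend zfs_dirty_data_max = 4 GiB */

	for (unsigned int pct = 0; pct <= 100; pct += 15) {
		uint64_t dirty = dirty_data_max * pct / 100;
		printf("%3u%% dirty -> %u async writes\n", pct,
		    async_write_max_active(dirty, dirty_data_max));
	}
	return (0);
}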
*/ -int zfs_vdev_async_write_active_min_dirty_percent = 30; -int zfs_vdev_async_write_active_max_dirty_percent = 60; +uint_t zfs_vdev_async_write_active_min_dirty_percent = 30; +uint_t zfs_vdev_async_write_active_max_dirty_percent = 60; /* * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), @@ -178,7 +178,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60; * interactive I/O, then the vdev is considered to be "idle", and the number * of concurrently-active non-interactive I/O's is increased to *_max_active. */ -uint_t zfs_vdev_nia_delay = 5; +static uint_t zfs_vdev_nia_delay = 5; /* * Some HDDs tend to prioritize sequential I/O so high that concurrent @@ -190,7 +190,7 @@ uint_t zfs_vdev_nia_delay = 5; * I/Os. This enforced wait ensures the HDD services the interactive I/O * within a reasonable amount of time. */ -uint_t zfs_vdev_nia_credit = 5; +static uint_t zfs_vdev_nia_credit = 5; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. @@ -198,10 +198,10 @@ uint_t zfs_vdev_nia_credit = 5; * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ -int zfs_vdev_aggregation_limit = 1 << 20; -int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; -int zfs_vdev_read_gap_limit = 32 << 10; -int zfs_vdev_write_gap_limit = 4 << 10; +static uint_t zfs_vdev_aggregation_limit = 1 << 20; +static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; +static uint_t zfs_vdev_read_gap_limit = 32 << 10; +static uint_t zfs_vdev_write_gap_limit = 4 << 10; /* * Define the queue depth percentage for each top-level. This percentage is @@ -214,9 +214,9 @@ int zfs_vdev_write_gap_limit = 4 << 10; * to 30 allocations per device. */ #ifdef _KERNEL -int zfs_vdev_queue_depth_pct = 1000; +uint_t zfs_vdev_queue_depth_pct = 1000; #else -int zfs_vdev_queue_depth_pct = 300; +uint_t zfs_vdev_queue_depth_pct = 300; #endif /* @@ -226,14 +226,7 @@ int zfs_vdev_queue_depth_pct = 300; * we assume that the average allocation size is 4k, so we need the queue depth * to be 32 per allocator to get good aggregation of sequential writes. */ -int zfs_vdev_def_queue_depth = 32; - -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. 
- */ -int zfs_vdev_aggregate_trim = 0; +uint_t zfs_vdev_def_queue_depth = 32; static int vdev_queue_offset_compare(const void *x1, const void *x2) @@ -249,39 +242,64 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } -static int +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) { + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + vq->vq_class[p].vqc_list_numnodes++; + } + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + vq->vq_class[p].vqc_list_numnodes--; + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + +static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { @@ -313,10 +331,10 @@ vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) } } -static int +static uint_t vdev_queue_max_async_writes(spa_t *spa) { - int writes; + uint_t writes; uint64_t dirty = 0; dsl_pool_t *dp = spa_get_dsl(spa); uint64_t min_bytes = zfs_dirty_data_max * @@ -359,8 +377,8 @@ vdev_queue_max_async_writes(spa_t *spa) return (writes); } -static int -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +static uint_t +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +388,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -408,16 +426,16 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) } /* - * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if + * Return the i/o class to 
issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if * there is no eligible class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +443,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +462,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +478,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
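The replacement comparator earlier in this hunk (vdev_queue_to_compare, with VDQ_T_SHIFT = 29) orders the non-FIFO classes first by a coarse timestamp bucket and then by offset, so LBA elevator ordering only applies among I/Os queued within roughly the same half-second window and older I/Os are not starved. A sketch of that bucketing follows, assuming nanosecond timestamps as returned by gethrtime(); the struct and function names are illustrative and the pointer tie-break is omitted.

#include <stdint.h>
#include <stdio.h>

#define VDQ_T_SHIFT	29	/* 2^29 ns is roughly 0.54 s per bucket */

struct io {
	uint64_t timestamp;	/* nanoseconds, as from gethrtime() */
	uint64_t offset;	/* byte offset on the vdev */
};

/*
 * Compare two queued I/Os: coarse time bucket first (prevents starvation),
 * then offset (elevator order within a bucket).
 */
static int
io_compare(const struct io *a, const struct io *b)
{
	uint64_t ta = a->timestamp >> VDQ_T_SHIFT;
	uint64_t tb = b->timestamp >> VDQ_T_SHIFT;

	if (ta != tb)
		return (ta < tb ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct io old_far = { .timestamp = 100000000ULL, .offset = 900 << 20 };
	struct io new_near = { .timestamp = 2000000000ULL, .offset = 1 << 20 };

	/* The older I/O sorts first even though its offset is much larger. */
	printf("%d\n", io_compare(&old_far, &new_near));	/* -1 */
	return (0);
}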
- */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +509,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +563,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +580,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + 
vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +589,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +623,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MAX(MIN(limit, maxblocksize), 0); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. - */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +655,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +682,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +712,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -725,6 +751,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * after our span is mandatory. 
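The aggregation code below walks backwards and forwards through offset-adjacent queued I/Os, bounded by the aggregation limit and, for reads, a permitted gap. The span and gap it tests are plain offset arithmetic; the sketch that follows assumes the conventional definitions (span from the start of the first I/O to the end of the last, gap between the end of one I/O and the start of the next) rather than quoting the IO_SPAN()/IO_GAP() macros, and the helper names and limits are parameters chosen for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct io {
	uint64_t offset;
	uint64_t size;
};

/* Assumed equivalents of the IO_SPAN()/IO_GAP() checks used by the queue. */
static uint64_t
io_span(const struct io *first, const struct io *last)
{
	return (last->offset + last->size - first->offset);
}

static uint64_t
io_gap(const struct io *prev, const struct io *next)
{
	return (next->offset >= prev->offset + prev->size ?
	    next->offset - (prev->offset + prev->size) : 0);
}

/* Would appending 'next' to an aggregate starting at 'first' still fit? */
static bool
can_extend(const struct io *first, const struct io *prev,
    const struct io *next, uint64_t limit, uint64_t maxgap)
{
	return (io_span(first, next) <= limit && io_gap(prev, next) <= maxgap);
}

int
main(void)
{
	struct io a = { .offset = 0,          .size = 128 << 10 };
	struct io b = { .offset = 160 << 10,  .size = 128 << 10 };	/* 32 KiB gap */

	/* Reads allow a 32 KiB gap by default, writes only 4 KiB. */
	printf("read aggregates:  %d\n", can_extend(&a, &a, &b, 1 << 20, 32 << 10));
	printf("write aggregates: %d\n", can_extend(&a, &a, &b, 1 << 20, 4 << 10));
	return (0);
}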
*/ dio = AVL_NEXT(t, last); + ASSERT3P(dio, !=, NULL); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ @@ -739,7 +766,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -747,8 +774,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); + flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; @@ -756,6 +782,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) do { dio = nio; nio = AVL_NEXT(t, dio); + ASSERT3P(dio, !=, NULL); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); @@ -823,19 +850,30 @@ again: return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. - */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -905,7 +943,7 @@ vdev_queue_io(zio_t *zio) ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); @@ -966,7 +1004,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1001,12 +1038,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. 
*/ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1019,10 +1055,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1031,91 +1067,99 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW, - "Max vdev I/O aggregation size"); +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (vq->vq_class[p].vqc_list_numnodes); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW, - "Max vdev I/O aggregation size for non-rotating media"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, + "Max vdev I/O aggregation size"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, + ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, UINT, ZMOD_RW, "Aggregate write I/O over gap"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, UINT, ZMOD_RW, "Maximum number of active I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW, - "Async write concurrency max threshold"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, + UINT, ZMOD_RW, "Async write concurrency max threshold"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW, - "Async write concurrency min threshold"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, + UINT, ZMOD_RW, "Async write concurrency min threshold"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, UINT, ZMOD_RW, "Max active async read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, UINT, ZMOD_RW, "Min active async read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, UINT, ZMOD_RW, "Max active async write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW, 
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, UINT, ZMOD_RW, "Min active async write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, UINT, ZMOD_RW, "Max active initializing I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, UINT, ZMOD_RW, "Min active initializing I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, UINT, ZMOD_RW, "Max active removal I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, UINT, ZMOD_RW, "Min active removal I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, UINT, ZMOD_RW, "Max active scrub I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, UINT, ZMOD_RW, "Min active scrub I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, UINT, ZMOD_RW, "Max active sync read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, UINT, ZMOD_RW, "Min active sync read I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, UINT, ZMOD_RW, "Max active sync write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, UINT, ZMOD_RW, "Min active sync write I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, UINT, ZMOD_RW, "Max active trim/discard I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, UINT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, UINT, ZMOD_RW, "Max active rebuild I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, UINT, ZMOD_RW, "Min active rebuild I/Os per vdev"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW, "Number of non-interactive I/Os to allow in sequence"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW, "Number of non-interactive I/Os before _max_active"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW, + "Default queue depth for each allocator"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 
7e7202ec1e55..15c8b8ca6016 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -27,15 +27,22 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> #include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> +#include <sys/dmu_tx.h> #include <sys/abd.h> +#include <sys/zfs_rlock.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> #include <sys/vdev_draid.h> +#include <sys/uberblock_impl.h> +#include <sys/dsl_scan.h> #ifdef ZFS_DEBUG #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -135,6 +142,237 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } + +/* + * Big Theory Statement for how a RAIDZ VDEV is expanded + * + * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion + * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs + * that have been previously expanded can be expanded again. + * + * The RAIDZ VDEV must be healthy (must be able to write to all the drives in + * the VDEV) when an expansion starts. And the expansion will pause if any + * disk in the VDEV fails, and resume once the VDEV is healthy again. All other + * operations on the pool can continue while an expansion is in progress (e.g. + * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, + * and zpool initialize which can't be run during an expansion. Following a + * reboot or export/import, the expansion resumes where it left off. + * + * == Reflowing the Data == + * + * The expansion involves reflowing (copying) the data from the current set + * of disks to spread it across the new set which now has one more disk. This + * reflow operation is similar to reflowing text when the column width of a + * text editor window is expanded. The text doesn’t change but the location of + * the text changes to accommodate the new width. An example reflow result for + * a 4-wide RAIDZ1 to a 5-wide is shown below. 
+ * + * Reflow End State + * Each letter indicates a parity group (logical stripe) + * + * Before expansion After Expansion + * D1 D2 D3 D4 D1 D2 D3 D4 D5 + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | A | A | A | A | | A | A | A | A | B | + * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | B | B | C | C | | B | C | C | C | C | + * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | C | C | D | D | | D | D | E | E | E | + * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | E | E | E | E | --> | E | F | F | G | G | + * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | F | F | G | G | | G | G | H | H | H | + * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | G | G | H | H | | H | I | I | J | J | + * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | H | H | I | I | | J | J | | | K | + * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| + * +------+------+------+------+ +------+------+------+------+------+ + * + * This reflow approach has several advantages. There is no need to read or + * modify the block pointers or recompute any block checksums. The reflow + * doesn’t need to know where the parity sectors reside. We can read and write + * data sequentially and the copy can occur in a background thread in open + * context. The design also allows for fast discovery of what data to copy. + * + * The VDEV metaslabs are processed, one at a time, to copy the block data to + * have it flow across all the disks. The metaslab is disabled for allocations + * during the copy. As an optimization, we only copy the allocated data which + * can be determined by looking at the metaslab range tree. During the copy we + * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still + * need to be able to survive losing parity count disks). This means we + * cannot overwrite data during the reflow that would be needed if a disk is + * lost. + * + * After the reflow completes, all newly-written blocks will have the new + * layout, i.e., they will have the parity to data ratio implied by the new + * number of disks in the RAIDZ group. Even though the reflow copies all of + * the allocated space (data and parity), it is only rearranged, not changed. + * + * This act of reflowing the data has a few implications about blocks + * that were written before the reflow completes: + * + * - Old blocks will still use the same amount of space (i.e., they will have + * the parity to data ratio implied by the old number of disks in the RAIDZ + * group). + * - Reading old blocks will be slightly slower than before the reflow, for + * two reasons. First, we will have to read from all disks in the RAIDZ + * VDEV, rather than being able to skip the children that contain only + * parity of this block (because the data of a single block is now spread + * out across all the disks). Second, in most cases there will be an extra + * bcopy, needed to rearrange the data back to its original layout in memory. 
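As a small, hypothetical illustration of the remapping just described (not part of the patch): if a flat sector index is laid out row by row across the children, growing the width only changes which child and row each sector lands on; the data itself is untouched. The example reproduces the diagram's sector "B6" moving from D2 to D1.

#include <stdint.h>
#include <stdio.h>

/* Where a flat sector index lands when striped across `width` children. */
static void
locate(uint64_t sector, uint64_t width, uint64_t *child, uint64_t *row)
{
    *child = sector % width;
    *row = sector / width;
}

int
main(void)
{
    uint64_t child, row;

    /* Sector 6 in the diagram (zero-based index 5), before expansion. */
    locate(5, 4, &child, &row);
    printf("4-wide: child D%llu, row %llu\n",
        (unsigned long long)child + 1, (unsigned long long)row);

    /* The same sector after the vdev has grown to 5 children. */
    locate(5, 5, &child, &row);
    printf("5-wide: child D%llu, row %llu\n",
        (unsigned long long)child + 1, (unsigned long long)row);
    return (0);
}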
+ * + * == Scratch Area == + * + * As we copy the block data, we can only progress to the point that writes + * will not overlap with blocks whose progress has not yet been recorded on + * disk. Since partially-copied rows are always read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent any + * row-wise overlap. For example, in the diagram above, when we reflow sector + * B6 it will overwite the original location for B5. + * + * To get around this, a scratch space is used so that we can start copying + * without risking data loss by overlapping the row. As an added benefit, it + * improves performance at the beginning of the reflow, but that small perf + * boost wouldn't be worth the complexity on its own. + * + * Ideally we want to copy at least 2 * (new_width)^2 so that we have a + * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max + * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice + * the widths will likely be single digits so we can get a substantial chuck + * size using only a few MB of scratch per disk. + * + * The scratch area is persisted to disk which holds a large amount of reflowed + * state. We can always read the partially written stripes when a disk fails or + * the copy is interrupted (crash) during the initial copying phase and also + * get past a small chunk size restriction. At a minimum, the scratch space + * must be large enough to get us to the point that one row does not overlap + * itself when moved (i.e new_width^2). But going larger is even better. We + * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels + * as our scratch space to handle overwriting the initial part of the VDEV. + * + * 0 256K 512K 4M + * +------+------+-----------------------+----------------------------- + * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... + * | L0 | L1 | Reserved | (Metaslabs) + * +------+------+-----------------------+------------------------------- + * Scratch Area + * + * == Reflow Progress Updates == + * After the initial scratch-based reflow, the expansion process works + * similarly to device removal. We create a new open context thread which + * reflows the data, and periodically kicks off sync tasks to update logical + * state. In this case, state is the committed progress (offset of next data + * to copy). We need to persist the completed offset on disk, so that if we + * crash we know which format each VDEV offset is in. + * + * == Time Dependent Geometry == + * + * In non-expanded RAIDZ, blocks are read from disk in a column by column + * fashion. For a multi-row block, the second sector is in the first column + * not in the second column. This allows us to issue full reads for each + * column directly into the request buffer. The block data is thus laid out + * sequentially in a column-by-column fashion. + * + * For example, in the before expansion diagram above, one logical block might + * be sectors G19-H26. The parity is in G19,H23; and the data is in + * G20,H24,G21,H25,G22,H26. + * + * After a block is reflowed, the sectors that were all in the original column + * data can now reside in different columns. When reading from an expanded + * VDEV, we need to know the logical stripe width for each block so we can + * reconstitute the block’s data after the reads are completed. Likewise, + * when we perform the combinatorial reconstruction we need to know the + * original width so we can retry combinations from the past layouts. 
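Referring back to the Scratch Area sizing above, a quick worked calculation (illustrative only) of the "2 * new_width^2 sectors" rule at the maximum raidz width with 4 KiB sectors:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t new_width = 255;               /* maximum raidz width */
    uint64_t sector = 4096;                 /* 4 KiB sectors (ashift=12) */
    uint64_t sectors = 2 * new_width * new_width;
    uint64_t per_disk = (sectors / new_width) * sector;

    /* Prints 130050 sectors total, 2088960 bytes (~2 MiB) per disk. */
    printf("%llu sectors, %llu bytes per disk\n",
        (unsigned long long)sectors, (unsigned long long)per_disk);
    return (0);
}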
+ * + * Time dependent geometry is what we call having blocks with different layouts + * (stripe widths) in the same VDEV. This time-dependent geometry uses the + * block’s birth time (+ the time expansion ended) to establish the correct + * width for a given block. After an expansion completes, we record the time + * for blocks written with a particular width (geometry). + * + * == On Disk Format Changes == + * + * New pool feature flag, 'raidz_expansion' whose reference count is the number + * of RAIDZ VDEVs that have been expanded. + * + * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. + * + * Since the uberblock can point to arbitrary blocks, which might be on the + * expanding RAIDZ, and might or might not have been expanded. We need to know + * which way a block is laid out before reading it. This info is the next + * offset that needs to be reflowed and we persist that in the uberblock, in + * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. + * After the expansion is complete, we then use the raidz_expand_txgs array + * (see below) to determine how to read a block and the ub_raidz_reflow_info + * field no longer required. + * + * The uberblock's ub_raidz_reflow_info field also holds the scratch space + * state (i.e., active or not) which is also required before reading a block + * during the initial phase of reflowing the data. + * + * The top-level RAIDZ VDEV has two new entries in the nvlist: + * + * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here + * and used after the expansion is complete to + * determine how to read a raidz block + * 'raidz_expanding' boolean: present during reflow and removed after completion + * used during a spa import to resume an unfinished + * expansion + * + * And finally the VDEVs top zap adds the following informational entries: + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE + * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED + */ + +/* + * For testing only: pause the raidz expansion after reflowing this amount. + * (accessed by ZTS and ztest) + */ +#ifdef _KERNEL +static +#endif /* _KERNEL */ +unsigned long raidz_expand_max_reflow_bytes = 0; + +/* + * For testing only: pause the raidz expansion at a certain point. + */ +uint_t raidz_expand_pause_point = 0; + +/* + * Maximum amount of copy io's outstanding at once. + */ +static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; + +/* + * Apply raidz map abds aggregation if the number of rows in the map is equal + * or greater than the value below. + */ +static unsigned long raidz_io_aggregate_rows = 4; + +/* + * Automatically start a pool scrub when a RAIDZ expansion completes in + * order to verify the checksums of all blocks which have been copied + * during the expansion. Automatic scrubbing is enabled by default and + * is strongly recommended. 
+ */ +static int zfs_scrub_after_expand = 1; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -159,6 +397,17 @@ vdev_raidz_map_free(raidz_map_t *rm) for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); + if (rm->rm_nphys_cols) { + for (int i = 0; i < rm->rm_nphys_cols; i++) { + if (rm->rm_phys_col[i].rc_abd != NULL) + abd_free(rm->rm_phys_col[i].rc_abd); + } + + kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * + rm->rm_nphys_cols); + } + + ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -170,10 +419,37 @@ vdev_raidz_map_free_vsd(zio_t *zio) vdev_raidz_map_free(rm); } +static int +vdev_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = x1; + const reflow_node_t *r = x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; +raidz_row_t * +vdev_raidz_row_alloc(int cols) +{ + raidz_row_t *rr = + kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); + + rr->rr_cols = cols; + rr->rr_scols = cols; + + for (int c = 0; c < cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_shadow_devidx = INT_MAX; + rc->rc_shadow_offset = UINT64_MAX; + rc->rc_allow_repair = 1; + } + return (rr); +} + static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { @@ -302,7 +578,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); @@ -312,22 +588,22 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ - q = s / (dcols - nparity); + uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ - r = s - q * (dcols - nparity); + uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); + uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. 
@@ -343,43 +619,28 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rr = vdev_raidz_row_alloc(scols); rm->rm_row[0] = rr; - rr->rr_cols = acols; - rr->rr_scols = scols; rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif - asize = 0; + uint64_t asize = 0; - for (c = 0; c < scols; c++) { + for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - col = f + c; - coff = o; + uint64_t col = f + c; + uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (c >= acols) rc->rc_size = 0; @@ -419,13 +680,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rr->rr_col[0].rc_devidx; + uint64_t devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; - if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } @@ -435,7 +695,338 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } else { vdev_raidz_map_alloc_read(zio, rm); } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + +/* + * Everything before reflow_offset_synced should have been moved to the new + * location (read and write completed). However, this may not yet be reflected + * in the on-disk format (e.g. raidz_reflow_sync() has been called but the + * uberblock has not yet been written). If reflow is not in progress, + * reflow_offset_synced should be UINT64_MAX. For each row, if the row is + * entirely before reflow_offset_synced, it will come from the new location. + * Otherwise this row will come from the old location. Therefore, rows that + * straddle the reflow_offset_synced will come from the old location. + * + * For writes, reflow_offset_next is the next offset to copy. If a sector has + * been copied, but not yet reflected in the on-disk progress + * (reflow_offset_synced), it will also be written to the new (already copied) + * offset. + */ +noinline raidz_map_t * +vdev_raidz_map_alloc_expanded(zio_t *zio, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset_synced, + uint64_t reflow_offset_next, boolean_t use_scratch) +{ + abd_t *abd = zio->io_abd; + uint64_t offset = zio->io_offset; + uint64_t size = zio->io_size; + + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + uint64_t q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. 
+ */ + uint64_t r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + rm->rm_skipstart = bc; + uint64_t asize = 0; + + for (uint64_t row = 0; row < rows; row++) { + boolean_t row_use_scratch = B_FALSE; + raidz_row_t *rr = vdev_raidz_row_alloc(cols); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and the copying has + * not yet completed for any part of this row, then use the + * old location of this row. Note that reflow_offset_synced + * reflects the i/o that's been completed, because it's + * updated by a synctask, after zio_wait(spa_txg_zio[]). + * This is sufficient for our check, even if that progress + * has not yet been recorded to disk (reflected in + * spa_ubsync). Also note that we consider the last row to + * be "full width" (`cols`-wide rather than `bc`-wide) for + * this calculation. This causes a tiny bit of unnecessary + * double-writes but is safe and simpler to calculate. + */ + int row_phys_cols = physical_cols; + if (b + cols > reflow_offset_synced >> ashift) + row_phys_cols--; + else if (use_scratch) + row_use_scratch = B_TRUE; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * Note, rr_cols is the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_firstdatacol = nparity; +#ifdef ZFS_DEBUG + /* + * note: rr_size is PSIZE, not ASIZE + */ + rr->rr_offset = b << ashift; + rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; +#endif + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_devidx = child_id; + rc->rc_offset = child_offset; + + /* + * Get this from the scratch space if appropriate. + * This only happens if we crashed in the middle of + * raidz_reflow_scratch_sync() (while it's running, + * the rangelock prevents us from doing concurrent + * io), and even then only during zpool import or + * when the pool is imported readonly. 
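A hypothetical standalone helper mirroring the per-row decision shown above: a row whose sectors are not yet entirely below the synced reflow offset is still read from the old, one-disk-narrower layout, and its starting child and offset are derived from that width. The names and the struct are illustrative; `b` is the row's first sector on the parent vdev and offsets are in bytes.

#include <stdbool.h>
#include <stdint.h>

struct row_loc {
    bool old_layout;        /* row still lives at the pre-copy location */
    uint64_t start_child;   /* first child vdev the row touches */
    uint64_t start_offset;  /* byte offset of the row on that child */
};

static struct row_loc
row_locate(uint64_t b, uint64_t cols, uint64_t physical_cols,
    uint64_t reflow_offset_synced, uint64_t ashift)
{
    struct row_loc loc;
    uint64_t row_phys_cols = physical_cols;

    /* Any overlap with not-yet-synced space means the old width applies. */
    if (b + cols > (reflow_offset_synced >> ashift))
        row_phys_cols--;

    loc.old_layout = (row_phys_cols != physical_cols);
    loc.start_child = b % row_phys_cols;
    loc.start_offset = (b / row_phys_cols) << ashift;
    return (loc);
}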
+ */ + if (row_use_scratch) + rc->rc_offset -= VDEV_BOOT_SIZE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rc->rc_size = 1ULL << ashift; + + /* + * Parity sectors' rc_abd's are set below + * after determining if this is an aggregation. + */ + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end of the block (even including + * skip sectors). This sector is part of the + * map so that we have full rows for p/q parity + * generation. + */ + rc->rc_size = 0; + rc->rc_abd = NULL; + } else { + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rc->rc_size = 1ULL << ashift; + rc->rc_abd = abd_get_offset_struct( + &rc->rc_abdstruct, abd, off << ashift, + rc->rc_size); + } + + if (rc->rc_size == 0) + continue; + + /* + * If any part of this row is in both old and new + * locations, the primary location is the old + * location. If this sector was already copied to the + * new location, we need to also write to the new, + * "shadow" location. + * + * Note, `row_phys_cols != physical_cols` indicates + * that the primary location is the old location. + * `b+c < reflow_offset_next` indicates that the copy + * to the new location has been initiated. We know + * that the copy has completed because we have the + * rangelock, which is held exclusively while the + * copy is in progress. + */ + if (row_use_scratch || + (row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift)) { + rc->rc_shadow_devidx = (b + c) % physical_cols; + rc->rc_shadow_offset = + ((b + c) / physical_cols) << ashift; + if (row_use_scratch) + rc->rc_shadow_offset -= VDEV_BOOT_SIZE; + } + + asize += rc->rc_size; + } + + /* + * See comment in vdev_raidz_map_alloc() + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + int devidx0 = rr->rr_col[0].rc_devidx; + uint64_t offset0 = rr->rr_col[0].rc_offset; + int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; + uint64_t shadow_offset0 = + rr->rr_col[0].rc_shadow_offset; + + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[0].rc_shadow_devidx = + rr->rr_col[1].rc_shadow_devidx; + rr->rr_col[0].rc_shadow_offset = + rr->rr_col[1].rc_shadow_offset; + + rr->rr_col[1].rc_devidx = devidx0; + rr->rr_col[1].rc_offset = offset0; + rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; + rr->rr_col[1].rc_shadow_offset = shadow_offset0; + } + } + ASSERT3U(asize, ==, tot << ashift); + + /* + * Determine if the block is contiguous, in which case we can use + * an aggregation. + */ + if (rows >= raidz_io_aggregate_rows) { + rm->rm_nphys_cols = physical_cols; + rm->rm_phys_col = + kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, + KM_SLEEP); + + /* + * Determine the aggregate io's offset and size, and check + * that the io is contiguous. 
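Complementing the previous sketch, this hypothetical helper reproduces the shadow-column assignment made earlier in this hunk: a sector that has already been copied is additionally written at the position it will occupy in the new, full-width layout.

#include <stdint.h>

struct shadow_loc {
    uint64_t devidx;    /* child that holds the copied sector */
    uint64_t offset;    /* byte offset of the copy on that child */
};

static struct shadow_loc
shadow_locate(uint64_t b, uint64_t c, uint64_t physical_cols, uint64_t ashift)
{
    struct shadow_loc loc;

    loc.devidx = (b + c) % physical_cols;
    loc.offset = ((b + c) / physical_cols) << ashift;
    return (loc);
}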
+ */ + for (int i = 0; + i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + + if (rc->rc_size == 0) + continue; + + if (prc->rc_size == 0) { + ASSERT0(prc->rc_offset); + prc->rc_offset = rc->rc_offset; + } else if (prc->rc_offset + prc->rc_size != + rc->rc_offset) { + /* + * This block is not contiguous and + * therefore can't be aggregated. + * This is expected to be rare, so + * the cost of allocating and then + * freeing rm_phys_col is not + * significant. + */ + kmem_free(rm->rm_phys_col, + sizeof (raidz_col_t) * + rm->rm_nphys_cols); + rm->rm_phys_col = NULL; + rm->rm_nphys_cols = 0; + break; + } + prc->rc_size += rc->rc_size; + } + } + } + if (rm->rm_phys_col != NULL) { + /* + * Allocate aggregate ABD's. + */ + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + + prc->rc_devidx = i; + + if (prc->rc_size == 0) + continue; + + prc->rc_abd = + abd_alloc_linear(rm->rm_phys_col[i].rc_size, + B_FALSE); + } + /* + * Point the parity abd's into the aggregate abd's. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_abd = + abd_get_offset_struct(&rc->rc_abdstruct, + prc->rc_abd, + rc->rc_offset - prc->rc_offset, + rc->rc_size); + } + } + } else { + /* + * Allocate new abd's for the parity sectors. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = + abd_alloc_linear(rc->rc_size, + B_TRUE); + } + } + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -453,11 +1044,11 @@ vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++) + for (int i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); @@ -469,11 +1060,11 @@ vdev_raidz_pq_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -488,11 +1079,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -618,7 +1209,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { - ASSERT3U(rr->rr_cols, !=, 0); + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * due 
to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) @@ -648,10 +1247,10 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } -/* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) { + (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; int cnt = size / sizeof (src[0]); @@ -663,11 +1262,11 @@ vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) return (0); } -/* ARGSUSED */ static int vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, void *private) { + (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; uint64_t mask; @@ -681,10 +1280,10 @@ vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, return (0); } -/* ARGSUSED */ static int vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) { + (void) private; uint64_t *dst = buf; uint64_t mask; int cnt = size / sizeof (dst[0]); @@ -770,6 +1369,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) int x = tgts[0]; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); + ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); @@ -802,6 +1404,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) int c, exp; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); + ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); @@ -848,6 +1453,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) int y = tgts[1]; abd_t *xd, *yd; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); @@ -926,7 +1534,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; } -/* BEGIN CSTYLED */ /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coefficients used to generate parity as well as @@ -1078,7 +1685,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ -/* END CSTYLED */ static void vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, @@ -1285,8 +1891,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { - int n, i, c, t, tt; - int nmissing_rows; + int i, c, t, tt; + unsigned int n; + unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; @@ -1297,11 +1904,14 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) abd_t **bufs = NULL; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. 
*/ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + ASSERT(rr->rr_col[i].rc_abd != NULL); if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); @@ -1429,10 +2039,23 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, + (int)rr->rr_missingparity); + } + nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " + "offset=%llx error=%u)", + rr, c, (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); + } if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; @@ -1529,12 +2152,25 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); } + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); + } + + if (vd->vdev_rz_expanding) { + *asize *= vd->vdev_children - 1; + *max_asize *= vd->vdev_children - 1; - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; + vd->vdev_min_asize = *asize; + } else { + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ -1553,19 +2189,70 @@ vdev_raidz_close(vdev_t *vd) } } +/* + * Return the logical width to use, given the txg in which the allocation + * happened. Note that BP_GET_BIRTH() is usually the txg in which the + * BP was allocated. Remapped BP's (that were relocated due to device + * removal, see remap_blkptr_cb()), will have a more recent physical birth + * which reflects when the BP was relocated, but we can ignore these because + * they can't be on RAIDZ (device removal doesn't support RAIDZ). + */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) + width = re->re_logical_width; + else + width = vdrz->vd_original_width; + } + mutex_exit(&vdrz->vd_expand_lock); + return (width); +} + +/* + * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated + * more space due to the lower data-to-parity ratio. In this case it's + * important to pass in the correct txg. Note that vdev_gang_header_asize() + * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, + * regardless of txg. This is assured because for a single data sector, we + * allocate P+1 sectors regardless of width ("cols", which is at least P+1). 
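A compact sketch of the allocated-size arithmetic implemented just below, simplified to take the column count directly instead of resolving it from the block's birth txg: one parity sector is added for every cols - nparity data sectors, and the total is padded to a whole number of nparity + 1 sector groups.

#include <stdint.h>

static uint64_t
raidz_asize_example(uint64_t psize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
    uint64_t asize = ((psize - 1) >> ashift) + 1;   /* data sectors */

    /* Parity sectors: one per (cols - nparity) data sectors, rounded up. */
    asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));

    /* Pad to a multiple of (nparity + 1) sectors. */
    asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);

    return (asize << ashift);
}

For a 128 KiB block with ashift=12 and raidz1, this yields 44 sectors at the original width of 4 but only 40 at the expanded width of 5, which is why blocks written before the expansion keep the larger footprint.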
+ */ +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_logical_width; + uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; + cols = vdev_raidz_get_logical_width(vdrz, txg); + asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; +#ifdef ZFS_DEBUG + uint64_t asize_new = ((psize - 1) >> ashift) + 1; + uint64_t ncols_new = vdrz->vd_physical_width; + asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / + (ncols_new - nparity)); + asize_new = roundup(asize_new, nparity + 1) << ashift; + VERIFY3U(asize_new, <=, asize); +#endif + return (asize); } @@ -1592,21 +2279,37 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +vdev_raidz_shadow_child_done(zio_t *zio) { -#ifdef ZFS_DEBUG - vdev_t *tvd = vd->vdev_top; + raidz_col_t *rc = zio->io_private; + + rc->rc_shadow_error = zio->io_error; +} +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) +{ + (void) rm; +#ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(vd, rr->rr_size); + vdev_raidz_asize(zio->io_vd, rr->rr_size, + BP_GET_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); + if (vdev_xlate_is_empty(&physical_rs)) { + /* + * If we are in the middle of expansion, the + * physical->logical mapping is changing so vdev_xlate() + * can't give us a reliable answer. + */ + return; + } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1617,7 +2320,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } @@ -1625,7 +2328,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) } static void -vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; @@ -1637,31 +2340,66 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ - vdev_raidz_io_verify(vd, rr, c); + vdev_raidz_io_verify(zio, rm, rr, c); - if (rc->rc_size > 0) { - ASSERT3P(rc->rc_abd, !=, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, - abd_get_size(rc->rc_abd), zio->io_type, - zio->io_priority, 0, vdev_raidz_child_done, rc)); - } else { - /* - * Generate optional write for skip sector to improve - * aggregation contiguity. 
- */ - ASSERT3P(rc->rc_abd, ==, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, NULL, 1ULL << ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, - NULL)); + if (rc->rc_size == 0) + continue; + + ASSERT3U(rc->rc_offset + rc->rc_size, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + + if (rc->rc_shadow_devidx != INT_MAX) { + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + ASSERT3U( + rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, + cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), + zio->io_type, zio->io_priority, 0, + vdev_raidz_shadow_child_done, rc)); } } } +/* + * Generate optional I/Os for skip sectors to improve aggregation contiguity. + * This only works for vdev_raidz_map_alloc() (not _expanded()). + */ +static void +raidz_start_skip_writes(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + raidz_map_t *rm = zio->io_vsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_size != 0) + continue; + ASSERT3P(rc->rc_abd, ==, NULL); + + ASSERT3U(rc->rc_offset, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, + NULL, 1ULL << ashift, zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + static void -vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; @@ -1693,7 +2431,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_skipped = 1; continue; } - if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1703,6 +2442,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) } } +static void +vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) +{ + vdev_t *vd = zio->io_vd; + + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + if (prc->rc_size == 0) + continue; + + ASSERT3U(prc->rc_devidx, ==, i); + vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { + prc->rc_error = SET_ERROR(ENXIO); + prc->rc_tried = 1; /* don't even try */ + prc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + prc->rc_error = SET_ERROR(ESTALE); + prc->rc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + prc->rc_offset, prc->rc_abd, prc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, prc)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) +{ + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. 
+ */ + boolean_t forceparity = rm->rm_nrows > 1; + + if (rm->rm_phys_col) { + vdev_raidz_io_start_read_phys_cols(zio, rm); + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_start_read_row(zio, rr, forceparity); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1726,24 +2515,83 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, + BP_GET_BIRTH(zio->io_bp)); + if (logical_width != vdrz->vd_physical_width) { + zfs_locked_range_t *lr = NULL; + uint64_t synced_offset = UINT64_MAX; + uint64_t next_offset = UINT64_MAX; + boolean_t use_scratch = B_FALSE; + /* + * Note: when the expansion is completing, we set + * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) + * in a later txg than when we last update spa_ubsync's state + * (see the end of spa_raidz_expand_thread()). Therefore we + * may see vre_state!=SCANNING before + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected + * on disk, but the copying progress has been synced to disk + * (and reflected in spa_ubsync). In this case it's fine to + * treat the expansion as completed, since if we crash there's + * no additional copying to do. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, + &vdrz->vn_vre); + lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + use_scratch = + (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == + RRSS_SCRATCH_VALID); + synced_offset = + RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); + next_offset = vdrz->vn_vre.vre_offset; + /* + * If we haven't resumed expanding since importing the + * pool, vre_offset won't have been set yet. In + * this case the next offset to be copied is the same + * as what was synced. + */ + if (next_offset == UINT64_MAX) { + next_offset = synced_offset; + } + } + if (use_scratch) { + zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" + "%lld next_offset=%lld use_scratch=%u", + zio, + zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", + (long long)zio->io_offset, + (long long)synced_offset, + (long long)next_offset, + use_scratch); + } + + rm = vdev_raidz_map_alloc_expanded(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + synced_offset, next_offset, use_scratch); + rm->rm_lr = lr; + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, logical_width, vdrz->vd_nparity); + } + rm->rm_original_width = vdrz->vd_original_width; - raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, - vdrz->vd_logical_width, vdrz->vd_nparity); zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - /* - * Until raidz expansion is implemented all maps for a raidz vdev - * contain a single row. - */ - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + } + + if (logical_width == vdrz->vd_physical_width) { + raidz_start_skip_writes(zio); + } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rr); + vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); @@ -1752,8 +2600,8 @@ vdev_raidz_io_start(zio_t *zio) /* * Report a checksum error for a child of a RAID-Z device. 
*/ -static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) +void +vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -1765,12 +2613,12 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_post_checksum(zio->io_spa, vd, - &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, - rc->rc_abd, bad_data, &zbc); mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + (void) zfs_ereport_post_checksum(zio->io_spa, vd, + &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, + rc->rc_abd, bad_data, &zbc); } } @@ -1781,11 +2629,9 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) static int raidz_checksum_verify(zio_t *zio) { - zio_bad_cksum_t zbc; + zio_bad_cksum_t zbc = {0}; raidz_map_t *rm = zio->io_vsd; - bzero(&zbc, sizeof (zio_bad_cksum_t)); - int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; @@ -1819,11 +2665,19 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) if (!rc->rc_tried || rc->rc_error != 0) continue; - orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); - abd_copy(orig[c], rc->rc_abd, rc->rc_size); + orig[c] = rc->rc_abd; + ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); + rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } /* + * Verify any empty sectors are zero filled to ensure the parity + * is calculated correctly even if these non-data sectors are damaged. + */ + if (rr->rr_nempty && rr->rr_abd_empty != NULL) + ret += vdev_draid_map_verify_empty(zio, rr); + + /* * Regenerates parity even for !tried||rc_error!=0 columns. This * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). 
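The parity-verify step above can be pictured with plain buffers and P (XOR) parity only; this simplified model (not the ABD-based kernel path) keeps the parity read from disk, regenerates it from the data columns, and reports any mismatch.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Regenerate P parity by XOR-ing all data columns into `p`. */
static void
gen_p(uint8_t *p, uint8_t *const *data, int ndata, size_t len)
{
    memset(p, 0, len);
    for (int c = 0; c < ndata; c++)
        for (size_t i = 0; i < len; i++)
            p[i] ^= data[c][i];
}

/* Compare the on-disk parity against a freshly computed copy. */
static bool
parity_matches(const uint8_t *p_on_disk, uint8_t *const *data, int ndata,
    size_t len, uint8_t *scratch)
{
    gen_p(scratch, data, ndata, len);
    return (memcmp(p_on_disk, scratch, len) == 0);
}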
@@ -1837,7 +2691,9 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { - raidz_checksum_error(zio, rc, orig[c]); + zfs_dbgmsg("found error on col=%u devidx=%u off %llx", + c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); + vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } @@ -1852,8 +2708,10 @@ vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rr->rr_cols; c++) + for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); + error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); + } return (error); } @@ -1882,6 +2740,9 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } + + if (rc->rc_force_repair) + unexpected_errors++; } /* @@ -1897,7 +2758,6 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) (zio->io_flags & ZIO_FLAG_RESILVER)) { int n = raidz_parity_verify(zio, rr); unexpected_errors += n; - ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); } if (zio->io_error == 0 && spa_writeable(zio->io_spa) && @@ -1917,6 +2777,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) continue; } + zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " + "offset=%llx", + zio, c, rc->rc_devidx, (long long)rc->rc_offset); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, @@ -1926,6 +2790,42 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + /* + * Scrub or resilver i/o's: overwrite any shadow locations with the + * good data. This ensures that if we've already copied this sector, + * it will be corrected if it was damaged. This writes more than is + * necessary, but since expansion is paused during scrub/resilver, at + * most a single row will have a shadow location. + */ + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + + if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; + + /* + * Note: We don't want to update the repair stats + * because that would incorrectly indicate that there + * was bad data to repair, which we aren't sure about. + * By clearing the SCAN_THREAD flag, we prevent this + * from happening, despite having the REPAIR flag set. + * We need to set SELF_HEAL so that this i/o can't be + * bypassed by zio_vdev_io_start(). + */ + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, + rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL); + cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; + zio_nowait(cio); + } + } } static void @@ -1945,6 +2845,43 @@ raidz_restore_orig_data(raidz_map_t *rm) } /* + * During raidz_reconstruct() for expanded VDEV, we need special consideration + * failure simulations. See note in raidz_reconstruct() on simulating failure + * of a pre-expansion device. + * + * Treating logical child i as failed, return TRUE if the given column should + * be treated as failed. The idea of logical children allows us to imagine + * that a disk silently failed before a RAIDZ expansion (reads from this disk + * succeed but return the wrong data). 
Since the expansion doesn't verify + * checksums, the incorrect data will be moved to new locations spread among + * the children (going diagonally across them). + * + * Higher "logical child failures" (values of `i`) indicate these + * "pre-expansion failures". The first physical_width values imagine that a + * current child failed; the next physical_width-1 values imagine that a + * child failed before the most recent expansion; the next physical_width-2 + * values imagine a child failed in the expansion before that, etc. + */ +static boolean_t +raidz_simulate_failure(int physical_width, int original_width, int ashift, + int i, raidz_col_t *rc) +{ + uint64_t sector_id = + physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + + for (int w = physical_width; w >= original_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} + +/* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed * returns 0 on successful reconstruction @@ -1953,6 +2890,15 @@ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; + int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; + + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " + "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); + } /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { @@ -1962,6 +2908,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) int dead = 0; int dead_data = 0; + if (dbgmsg) + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); @@ -1974,7 +2923,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { - if (rc->rc_devidx == ltgts[lt]) { + if (raidz_simulate_failure(physical_width, + original_width, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( @@ -1987,13 +2939,37 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) dead++; if (c >= nparity) dead_data++; - my_tgts[t++] = c; + /* + * Note: simulating failure of a + * pre-expansion device can hit more + * than one column, in which case we + * might try to simulate more failures + * than can be reconstructed, which is + * also more than the size of my_tgts. + * This check prevents accessing past + * the end of my_tgts. The "dead > + * nparity" check below will fail this + * reconstruction attempt. 
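A worked example of the mapping above, with hypothetical numbers: take physical_width = 5, original_width = 4 and ashift = 12, so sector_id = 5 * (rc_offset >> 12) + rc_devidx. Logical children 0 through 4 reduce to "current child i failed", because (5 * row + devidx) % 5 is simply devidx. Logical child 6 belongs to the width-4 era (i - 5 = 1) and fails every column whose sector_id % 4 == 1: in row 0 that is only devidx 1, but in row 1 the sector_ids are 5 through 9 with remainders 1, 2, 3, 0, 1, so both devidx 0 and devidx 4 are treated as failed. That is the "can hit more than one column" case the note above guards against.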
+ */ + if (t < VDEV_RAIDZ_MAXPARITY) { + my_tgts[t++] = c; + if (dbgmsg) { + zfs_dbgmsg("simulating " + "failure of col %u " + "devidx %u", c, + (int)rc->rc_devidx); + } + } break; } } } if (dead > nparity) { /* reconstruction not possible */ + if (dbgmsg) { + zfs_dbgmsg("reconstruction not possible; " + "too many failures"); + } raidz_restore_orig_data(rm); return (EINVAL); } @@ -2023,7 +2999,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) */ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { - raidz_checksum_error(zio, + vdev_raidz_checksum_error(zio, rc, rc->rc_orig_data); rc->rc_error = SET_ERROR(ECKSUM); @@ -2037,11 +3013,19 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zio_checksum_verified(zio); + if (dbgmsg) { + zfs_dbgmsg("reconstruction successful " + "(checksum verified)"); + } return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " + "failed", zio); + } return (ECKSUM); } @@ -2056,7 +3040,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) - * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * @@ -2087,18 +3071,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, - * since the raidz_map_t rows likely do not overlap reconstruction would be + * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. + * + * Returns 0 on success, ECKSUM on failure. */ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; - /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; @@ -2116,8 +3104,16 @@ vdev_raidz_combrec(zio_t *zio) int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ - /* Determine number of logical children, n */ - int n = zio->io_vd->vdev_children; + + /* + * Determine number of logical children, n. See comment + * above raidz_simulate_failure(). + */ + int n = 0; + for (int w = physical_width; + w >= original_width; w--) { + n += w; + } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); @@ -2148,6 +3144,14 @@ vdev_raidz_combrec(zio_t *zio) if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); + if (zfs_flags & + ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruction " + "failed for num_failures=" + "%u; tried all " + "combinations", + num_failures); + } break; } @@ -2159,7 +3163,7 @@ vdev_raidz_combrec(zio_t *zio) * Try the next combination. 
*/ if (ltgts[t] != ltgts[t + 1]) - break; + break; // found next combination /* * Otherwise, reset this tgt to the minimum, @@ -2174,7 +3178,8 @@ vdev_raidz_combrec(zio_t *zio) break; } } - + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } @@ -2199,7 +3204,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { - int total_errors = 0; + int normal_errors = 0; + int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -2208,24 +3214,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { + if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - total_errors++; + normal_errors++; + } + if (rc->rc_shadow_error != 0) { + ASSERT(rc->rc_shadow_error != ECKSUM); + shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough - * columns to reconstruct the data, the I/O failed. Otherwise, - * good enough. + * columns to reconstruct the data, the I/O failed. Otherwise, good + * enough. Note that in the case of a shadow write (during raidz + * expansion), depending on if we crash, either the normal (old) or + * shadow (new) location may become the "real" version of the block, + * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ - if (total_errors > rr->rr_firstdatacol) { + if (normal_errors > rr->rr_firstdatacol || + shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } @@ -2242,14 +3255,24 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + /* + * If scrubbing and a replacing/sparing child vdev determined + * that not all of its children have an identical copy of the + * data, then clear the error so the column is treated like + * any other read and force a repair to correct the damage. + */ + if (rc->rc_error == ECKSUM) { + ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); + vdev_raidz_checksum_error(zio, rc, rc->rc_abd); + rc->rc_force_repair = 1; + rc->rc_error = 0; + } + if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else @@ -2314,7 +3337,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. 
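Stepping back to the slot-advance walk in vdev_raidz_combrec() shown a little earlier: the increment, reset-to-minimum, and carry scheme is simply an enumeration of strictly increasing index tuples. A self-contained sketch of the same traversal (the helper name next_combination() is made up for illustration; this is not the kernel code):

/*
 * Advance idx[0..k-1], a strictly increasing combination drawn from
 * {0, ..., n-1}, to the next combination in the same order
 * vdev_raidz_combrec() visits them.  Returns B_FALSE once every
 * combination has been tried.
 */
static boolean_t
next_combination(int *idx, int k, int n)
{
	for (int t = 0; t < k; t++) {
		int limit = (t == k - 1) ? n : idx[t + 1];
		if (idx[t] + 1 < limit) {
			idx[t]++;	/* this slot can advance */
			return (B_TRUE);
		}
		idx[t] = t;		/* reset to minimum, carry to next slot */
	}
	return (B_FALSE);		/* all combinations exhausted */
}

Starting from idx = {0, 1, ..., k-1}, repeated calls visit every strictly increasing k-tuple over 0..n-1 exactly once, in the order the enumeration comment above describes.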
*/ - if (rr->rr_nempty && rr->rr_abd_empty == NULL) + if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { @@ -2357,12 +3380,12 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - (void) zfs_ereport_start_checksum(zio->io_spa, - cvd, &zio->io_bookmark, zio, rc->rc_offset, - rc->rc_size, &zbc); mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); + (void) zfs_ereport_start_checksum(zio->io_spa, + cvd, &zio->io_bookmark, zio, rc->rc_offset, + rc->rc_size, &zbc); } } } @@ -2372,11 +3395,48 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { + if (rm->rm_phys_col) { + /* + * This is an aggregated read. Copy the data and status + * from the aggregate abd's to the individual rows. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_error = prc->rc_error; + rc->rc_tried = prc->rc_tried; + rc->rc_skipped = prc->rc_skipped; + if (c >= rr->rr_firstdatacol) { + /* + * Note: this is slightly faster + * than using abd_copy_off(). + */ + char *physbuf = abd_to_buf( + prc->rc_abd); + void *physloc = physbuf + + rc->rc_offset - + prc->rc_offset; + + abd_copy_from_buf(rc->rc_abd, + physloc, rc->rc_size); + } + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, @@ -2423,7 +3483,54 @@ vdev_raidz_io_done(zio_t *zio) zio_vdev_io_redone(zio); return; } - + /* + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. This means that if the incorrect + * sectors were all on Nparity disks at any point in the + * past, we will find the correct data. The only known + * case where this is less durable than a non-expanded + * RAIDZ, is if we have a silent failure during + * expansion. In that case, one block could be + * partially in the old format and partially in the + * new format, so we'd lost some sectors from the old + * format and some from the new format. + * + * e.g. logical_width=4 physical_width=6 + * the 15 (6+5+4) possible failed disks are: + * width=6 child=0 + * width=6 child=1 + * width=6 child=2 + * width=6 child=3 + * width=6 child=4 + * width=6 child=5 + * width=5 child=0 + * width=5 child=1 + * width=5 child=2 + * width=5 child=3 + * width=5 child=4 + * width=4 child=0 + * width=4 child=1 + * width=4 child=2 + * width=4 child=3 + * And we will try every combination of Nparity of these + * failing. + * + * As a first pass, we can generate every combo, + * and try reconstructing, ignoring any known + * failures. If any row has too many known + simulated + * failures, then we bail on reconstructing with this + * number of simulated failures. As an improvement, + * we could detect the number of whole known failures + * (i.e. we have known failures on these disks for + * every row; the disks never succeeded), and + * subtract that from the max # failures to simulate. 
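To put numbers on the example above: with logical_width = 4 expanded to physical_width = 6 there are 6 + 5 + 4 = 15 logical children, so combinatorial reconstruction tries at most 15 single failures for raidz1, 15 + C(15,2) = 120 combinations for raidz2, and 15 + 105 + C(15,3) = 575 for raidz3, stopping as soon as one combination reconstructs data that passes the checksum.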
+ * We could go even further like the current + * combrec code, but that doesn't seem like it + * gains us very much. If we simulate a failure + * that is also a known failure, that's fine. + */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2431,6 +3538,10 @@ vdev_raidz_io_done(zio_t *zio) } } } + if (rm->rm_lr != NULL) { + zfs_rangelock_exit(rm->rm_lr); + rm->rm_lr = NULL; + } } static void @@ -2457,6 +3568,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * If we're in the middle of a RAIDZ expansion, this block may be in + * the old and/or new location. For simplicity, always resilver it. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) + return (B_TRUE); + uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; @@ -2496,10 +3615,29 @@ static void vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { + (void) remain_rs; + vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - uint64_t width = raidvd->vdev_children; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + /* + * We're in the middle of expansion, in which case the + * translation is in flux. Any answer we give may be wrong + * by the time we return, so it isn't safe for the caller to + * act on it. Therefore we say that this range isn't present + * on any children. The only consumers of this are "zpool + * initialize" and trimming, both of which are "best effort" + * anyway. + */ + physical_rs->rs_start = physical_rs->rs_end = 0; + remain_rs->rs_start = remain_rs->rs_end = 0; + return; + } + + uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; @@ -2525,15 +3663,1156 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, logical_rs->rs_end - logical_rs->rs_start); } +static void +raidz_reflow_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); + + mutex_enter(&vre->vre_lock); + uint64_t new_offset = + MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); + /* + * We should not have committed anything that failed. + */ + VERIFY3U(vre->vre_failed_offset, >=, old_offset); + mutex_exit(&vre->vre_lock); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + /* + * Update the uberblock that will be written when this txg completes. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, + RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); + vre->vre_offset_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&vre->vre_lock); + vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; + vre->vre_bytes_copied_pertxg[txgoff] = 0; + mutex_exit(&vre->vre_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); +} + +static void +raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) + VERIFY0(vre->vre_offset_pertxg[i]); + + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; + re->re_logical_width = vdrz->vd_physical_width; + mutex_enter(&vdrz->vd_expand_lock); + avl_add(&vdrz->vd_expand_txgs, re); + mutex_exit(&vdrz->vd_expand_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS + * will get written (based on vd_expand_txgs). + */ + vdev_config_dirty(vd); + + /* + * Before we change vre_state, the on-disk state must reflect that we + * have completed all copying, so that vdev_raidz_io_start() can use + * vre_state to determine if the reflow is in progress. See also the + * end of spa_raidz_expand_thread(). + */ + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, + raidvd->vdev_ms_count << raidvd->vdev_ms_shift); + + vre->vre_end_time = gethrestime_sec(); + vre->vre_state = DSS_FINISHED; + + uint64_t state = vre->vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t end_time = vre->vre_end_time; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time, tx)); + + spa->spa_uberblock.ub_raidz_reflow_info = 0; + + spa_history_log_internal(spa, "raidz vdev expansion completed", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)vd->vdev_id, + (unsigned long long)vd->vdev_children); + + spa->spa_raidz_expand = NULL; + raidvd->vdev_rz_expanding = B_FALSE; + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); + + /* + * While we're in syncing context take the opportunity to + * setup a scrub. All the data has been sucessfully copied + * but we have not validated any checksums. + */ + pool_scan_func_t func = POOL_SCAN_SCRUB; + if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) + dsl_scan_setup_sync(&func, tx); +} + +/* + * Struct for one copy zio. + */ +typedef struct raidz_reflow_arg { + vdev_raidz_expand_t *rra_vre; + zfs_locked_range_t *rra_lr; + uint64_t rra_txg; +} raidz_reflow_arg_t; + +/* + * The write of the new location is done. 
+ */ +static void +raidz_reflow_write_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + abd_free(zio->io_abd); + + mutex_enter(&vre->vre_lock); + if (zio->io_error != 0) { + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + } + ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); + vre->vre_outstanding_bytes -= zio->io_size; + if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < + vre->vre_failed_offset) { + vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + zfs_rangelock_exit(rra->rra_lr); + + kmem_free(rra, sizeof (*rra)); + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +raidz_reflow_read_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to vre_failed_offset. + */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " + "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&vre->vre_lock); + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + mutex_exit(&vre->vre_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, + dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&vre->vre_lock); + ASSERT3U(vre->vre_offset, <=, offset); + vre->vre_offset = offset; + mutex_exit(&vre->vre_lock); + + if (vre->vre_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, + spa, tx); + } + vre->vre_offset_pertxg[txgoff] = offset; +} + +static boolean_t +vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) +{ + for (int i = 0; i < raidz_vd->vdev_children; i++) { + /* Quick check if a child is being replaced */ + if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, + dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + int ashift = vd->vdev_top->vdev_ashift; + uint64_t offset, size; + + if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, + &offset, &size)) { + return (B_FALSE); + } + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + ASSERT3U(size, >=, 1 << ashift); + uint64_t length = 1 << ashift; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + uint64_t blkid = offset >> ashift; + + int old_children = vd->vdev_children - 1; + + /* + * We can only progress to the point that writes will not overlap + * with blocks whose progress has not yet been recorded on disk. 
+ * Since partially-copied rows are still read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent + * row-wise overlap. + * + * Note that even if we are skipping over a large unallocated region, + * we can't move the on-disk progress to `offset`, because concurrent + * writes/allocations could still use the currently-unallocated + * region. + */ + uint64_t ubsync_blkid = + RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; + uint64_t next_overwrite_blkid = ubsync_blkid + + ubsync_blkid / old_children - old_children; + VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); + + if (blkid >= next_overwrite_blkid) { + raidz_reflow_record_progress(vre, + next_overwrite_blkid << ashift, tx); + return (B_TRUE); + } + + range_tree_remove(rt, offset, length); + + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + rra->rra_vre = vre; + rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, + offset, length, RL_WRITER); + rra->rra_txg = dmu_tx_get_txg(tx); + + raidz_reflow_record_progress(vre, offset + length, tx); + + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += length; + mutex_exit(&vre->vre_lock); + + /* + * SCL_STATE will be released when the read and write are done, + * by raidz_reflow_write_done(). + */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* check if a replacing vdev was added, if so treat it as an error */ + if (vdev_raidz_expand_child_replacing(vd)) { + zfs_dbgmsg("replacing vdev encountered, reflow paused at " + "offset=%llu txg=%llu", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_txg); + + mutex_enter(&vre->vre_lock); + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + /* drop everything we acquired */ + zfs_rangelock_exit(rra->rra_lr); + kmem_free(rra, sizeof (*rra)); + spa_config_exit(spa, SCL_STATE, spa); + return (B_TRUE); + } + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(length, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + vd->vdev_child[blkid % vd->vdev_children], + (blkid / vd->vdev_children) << ashift, + abd, length, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_write_done, rra); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + vd->vdev_child[blkid % old_children], + (blkid / old_children) << ashift, + abd, length, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_read_done, rra)); + + return (B_FALSE); +} + +/* + * For testing (ztest specific) + */ +static void +raidz_expand_pause(uint_t pause_point) +{ + while (raidz_expand_pause_point != 0 && + raidz_expand_pause_point <= pause_point) + delay(hz); +} + +static void +raidz_scratch_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +/* + * Reflow the beginning portion of the vdev into an intermediate scratch area + * in memory and on disk. This operation must be persisted on disk before we + * proceed to overwrite the beginning portion with the reflowed data. + * + * This multi-step task can fail to complete if disk errors are encountered + * and we can return here after a pause (waiting for disk to become healthy). 
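A concrete illustration of the placement math in raidz_reflow_impl() above, with hypothetical numbers: with vdev_children = 5 (old_children = 4) and ashift = 12, logical sector blkid = 13 is read from child 13 % 4 = 1 at byte offset (13 / 4) << 12 = 3 * 4096 and rewritten to child 13 % 5 = 3 at offset (13 / 5) << 12 = 2 * 4096. For the overlap guard, if the on-disk progress is ubsync_blkid = 1000 then next_overwrite_blkid = 1000 + 1000 / 4 - 4 = 1246: sector 1250 is the first whose new-layout row (row 250) would overwrite the old-layout row holding sectors 1000 through 1003, which are not yet recorded as copied, and the formula stops about one row short of that, as the comment above requires.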
+ */ +static void +raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) +{ + vdev_raidz_expand_t *vre = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zio_t *pio; + int error; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + int ashift = raidvd->vdev_ashift; + uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, + uint64_t); + uint64_t logical_size = write_size * raidvd->vdev_children; + uint64_t read_size = + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + 1 << ashift); + + /* + * The scratch space must be large enough to get us to the point + * that one row does not overlap itself when moved. This is checked + * by vdev_raidz_attach_check(). + */ + VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); + VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); + VERIFY3U(write_size, <=, read_size); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + 0, logical_size, RL_WRITER); + + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(read_size, B_FALSE); + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); + + /* + * If we have already written the scratch area then we must read from + * there, since new writes were redirected there while we were paused + * or the original location may have been partially overwritten with + * reflowed data. + */ + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); + /* + * Read from scratch space. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE + * to the offset to calculate the physical offset to + * write to. Passing in a negative offset makes us + * access the scratch area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, + raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading scratch location", + error); + goto io_error_exit; + } + goto overwrite; + } + + /* + * Read from original location. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], read_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading original location", error); +io_error_exit: + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + return; + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); + + /* + * Reflow in memory. 
+ */ + uint64_t logical_sectors = logical_size >> ashift; + for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { + int oldchild = i % (raidvd->vdev_children - 1); + uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + + int newchild = i % raidvd->vdev_children; + uint64_t newoff = (i / raidvd->vdev_children) << ashift; + + /* a single sector should not be copying over itself */ + ASSERT(!(newchild == oldchild && newoff == oldoff)); + + abd_copy_off(abds[newchild], abds[oldchild], + newoff, oldoff, 1 << ashift); + } + + /* + * Verify that we filled in everything we intended to (write_size on + * each child). + */ + VERIFY0(logical_sectors % raidvd->vdev_children); + VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, + write_size); + + /* + * Write to scratch location (boot area). + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d writing scratch location", error); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", + (long long)logical_size); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); + + /* + * Update uberblock to indicate that scratch space is valid. This is + * needed because after this point, the real location may be + * overwritten. If we crash, we need to get the data from the + * scratch space, rather than the real location. + * + * Note: ub_timestamp is bumped so that vdev_uberblock_compare() + * will prefer this uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); + + /* + * Overwrite with reflow'ed data. + */ +overwrite: + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + /* + * When we exit early here and drop the range lock, new + * writes will go into the scratch area so we'll need to + * read from there when we return after pausing. + */ + zfs_dbgmsg("reflow: error %d writing real location", error); + /* + * Update the uberblock that is written when this txg completes. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, + logical_size); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", + (long long)logical_size); + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); + + /* + * Update uberblock to indicate that the initial part has been + * reflow'ed. This is needed because after this point (when we exit + * the rangelock), we allow regular writes to this region, which will + * be written to the new location only (because reflow_offset_next == + * reflow_offset_synced). If we crashed and re-copied from the + * scratch space, we would lose the regular writes. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, + logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); + + /* + * Update progress. + */ + vre->vre_offset = logical_size; + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note - raidz_reflow_sync() will update the uberblock state to + * RRSS_SCRATCH_INVALID_SYNCED_REFLOW + */ + raidz_reflow_sync(spa, tx); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); +} + +/* + * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work + * here. No other i/o can be in progress, so we don't need the vre_rangelock. + */ +void +vdev_raidz_reflow_copy_scratch(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + ASSERT0(logical_size % raidvd->vdev_children); + uint64_t write_size = logical_size / raidvd->vdev_children; + + zio_t *pio; + + /* + * Read from scratch space. + */ + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(write_size, B_FALSE); + } + + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + + /* + * Overwrite real location with reflow'ed data. 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " + "to real location", (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + /* + * Update uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); + spa->spa_ubsync.ub_timestamp++; + VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow recovery: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + spa_first_txg(spa)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset = logical_size; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note that raidz_reflow_sync() will update the uberblock once more + */ + raidz_reflow_sync(spa, tx); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_STATE, FTAG); +} + +static boolean_t +spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) +{ + (void) zthr; + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL && + !spa->spa_raidz_expand->vre_waiting_for_resilver); +} + +/* + * RAIDZ expansion background thread + * + * Can be called multiple times if the reflow is paused + */ +static void +spa_raidz_expand_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) + vre->vre_offset = 0; + else + vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); + + /* Reflow the begining portion using the scratch area */ + if (vre->vre_offset == 0) { + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, raidz_reflow_scratch_sync, + vre, 0, ZFS_SPACE_CHECK_NONE)); + + /* if we encountered errors then pause */ + if (vre->vre_offset == 0) { + mutex_enter(&vre->vre_lock); + vre->vre_waiting_for_resilver = B_TRUE; + mutex_exit(&vre->vre_lock); + return; + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + uint64_t guid = raidvd->vdev_guid; + + /* Iterate over all the remaining metaslabs */ + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; + i < raidvd->vdev_ms_count && + !zthr_iscancelled(zthr) && + vre->vre_failed_offset == UINT64_MAX; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly created (for the expanded + * space), in which case its trees won't exist yet, + * so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + continue; + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. 
in ms_defer), and it's fine to copy that too. + */ + range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, + NULL, 0, 0); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Force the last sector of each metaslab to be copied. This + * ensures that we advance the on-disk progress to the end of + * this metaslab while the metaslab is disabled. Otherwise, we + * could move past this metaslab without advancing the on-disk + * progress, and then an allocation to this metaslab would not + * be copied. + */ + int sectorsz = 1 << raidvd->vdev_ashift; + uint64_t ms_last_offset = msp->ms_start + + msp->ms_size - sectorsz; + if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { + range_tree_add(rt, ms_last_offset, sectorsz); + } + + /* + * When we are resuming from a paused expansion (i.e. + * when importing a pool with a expansion in progress), + * discard any state that we have already processed. + */ + range_tree_clear(rt, 0, vre->vre_offset); + + while (!zthr_iscancelled(zthr) && + !range_tree_is_empty(rt) && + vre->vre_failed_offset == UINT64_MAX) { + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * If requested, pause the reflow when the amount + * specified by raidz_expand_max_reflow_bytes is reached + * + * This pause is only used during testing or debugging. + */ + while (raidz_expand_max_reflow_bytes != 0 && + raidz_expand_max_reflow_bytes <= + vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { + delay(hz); + } + + mutex_enter(&vre->vre_lock); + while (vre->vre_outstanding_bytes > + raidz_expand_max_copy_bytes) { + cv_wait(&vre->vre_cv, &vre->vre_lock); + } + mutex_exit(&vre->vre_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the + * vdev_t that we're expanding may have changed. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + boolean_t needsync = + raidz_reflow_impl(raidvd, vre, rt, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * The txg_wait_synced() here ensures that all reflow zio's have + * completed, and vre_failed_offset has been set if necessary. It + * also ensures that the progress of the last raidz_reflow_sync() is + * written to disk before raidz_reflow_complete_sync() changes the + * in-memory vre_state. vdev_raidz_io_start() uses vre_state to + * determine if a reflow is in progress, in which case we may need to + * write to both old and new locations. 
Therefore we can only change + * vre_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(zthr) && + vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { + /* + * We are not being canceled or paused, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + raidz_reflow_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. + */ + spa_history_log_internal(spa, "reflow pause", + NULL, "offset=%llu failed_offset=%lld", + (long long)vre->vre_offset, + (long long)vre->vre_failed_offset); + mutex_enter(&vre->vre_lock); + if (vre->vre_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. + */ + vre->vre_offset = vre->vre_failed_offset; + vre->vre_failed_offset = UINT64_MAX; + vre->vre_waiting_for_resilver = B_TRUE; + } + mutex_exit(&vre->vre_lock); + } +} + +void +spa_start_raidz_expansion_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", + spa_raidz_expand_thread_check, spa_raidz_expand_thread, + spa, defclsyspri); +} + +void +raidz_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_raidz_expand != NULL) { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + /* + * we get called often from vdev_dtl_reassess() so make + * sure it's our vdev and any replacing is complete + */ + if (vd->vdev_top->vdev_id == vre->vre_vdev_id && + !vdev_raidz_expand_child_replacing(vd->vdev_top)) { + mutex_enter(&vre->vre_lock); + if (vre->vre_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing raidz expansion"); + vre->vre_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_raidz_expand_zthr); + } + mutex_exit(&vre->vre_lock); + } + } +} + +int +vdev_raidz_attach_check(vdev_t *new_child) +{ + vdev_t *raidvd = new_child->vdev_parent; + uint64_t new_children = raidvd->vdev_children; + + /* + * We use the "boot" space as scratch space to handle overwriting the + * initial part of the vdev. If it is too small, then this expansion + * is not allowed. This would be very unusual (e.g. ashift > 13 and + * >200 children). 
+ */ + if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { + return (EINVAL); + } + return (0); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, + new_child); + + spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + + vdrz->vd_physical_width++; + + VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; + vdrz->vn_vre.vre_offset = 0; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + spa->spa_raidz_expand = &vdrz->vn_vre; + zthr_wakeup(spa->spa_raidz_expand_zthr); + + /* + * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get + * written to the config. + */ + vdev_config_dirty(raidvd); + + vdrz->vn_vre.vre_start_time = gethrestime_sec(); + vdrz->vn_vre.vre_end_time = 0; + vdrz->vn_vre.vre_state = DSS_SCANNING; + vdrz->vn_vre.vre_bytes_copied = 0; + + uint64_t state = vdrz->vn_vre.vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t start_time = vdrz->vn_vre.vre_start_time; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time, tx)); + + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); + + spa_history_log_internal(spa, "raidz vdev expansion started", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)raidvd->vdev_id, + (unsigned long long)raidvd->vdev_children); +} + +int +vdev_raidz_load(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + int err; + + uint64_t state = DSS_NONE; + uint64_t start_time = 0; + uint64_t end_time = 0; + uint64_t bytes_copied = 0; + + if (vd->vdev_top_zap != 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (bytes_copied), 1, &bytes_copied); + if (err != 0 && err != ENOENT) + return (err); + } + + /* + * If we are in the middle of expansion, vre_state should have + * already been set by vdev_raidz_init(). 
+ */ + EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); + vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; + vdrz->vn_vre.vre_start_time = start_time; + vdrz->vn_vre.vre_end_time = end_time; + vdrz->vn_vre.vre_bytes_copied = bytes_copied; + + return (0); +} + +int +spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (vre == NULL) { + /* no removal in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = vd->vdev_tsd; + + if (vdrz->vn_vre.vre_end_time != 0 && + (vre == NULL || + vdrz->vn_vre.vre_end_time > + vre->vre_end_time)) { + vre = &vdrz->vn_vre; + } + } + } + } + + if (vre == NULL) { + return (SET_ERROR(ENOENT)); + } + + pres->pres_state = vre->vre_state; + pres->pres_expanding_vdev = vre->vre_vdev_id; + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + pres->pres_to_reflow = vd->vdev_stat.vs_alloc; + + mutex_enter(&vre->vre_lock); + pres->pres_reflowed = vre->vre_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; + mutex_exit(&vre->vre_lock); + + pres->pres_start_time = vre->vre_start_time; + pres->pres_end_time = vre->vre_end_time; + pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; + + return (0); +} + /* * Initialize private RAIDZ specific fields from the nvlist. */ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { - vdev_raidz_t *vdrz; - uint64_t nparity; - uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, @@ -2541,6 +4820,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (error != 0) return (SET_ERROR(EINVAL)); + uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); @@ -2567,10 +4847,56 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) nparity = 1; } - vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); - vdrz->vd_logical_width = children; + vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vn_vre.vre_vdev_id = -1; + vdrz->vn_vre.vre_offset = UINT64_MAX; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); + zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); + mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; + /* note, the ID does not exist when creating a pool */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &vdrz->vn_vre.vre_vdev_id); + + boolean_t reflow_in_progress = + nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + if (reflow_in_progress) { + spa->spa_raidz_expand = &vdrz->vn_vre; + vdrz->vn_vre.vre_state = DSS_SCANNING; + } + + vdrz->vd_original_width = children; + uint64_t *txgs; + unsigned int txgs_size = 0; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; 
+ + if (reflow_in_progress) + re->re_logical_width--; + + avl_add(&vdrz->vd_expand_txgs, re); + } + + vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + } + if (reflow_in_progress) { + vdrz->vd_original_width--; + zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", + children, txgs_size); + } + *tsd = vdrz; return (0); @@ -2579,7 +4905,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) static void vdev_raidz_fini(vdev_t *vd) { - kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) + vd->vdev_spa->spa_raidz_expand = NULL; + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vd_expand_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vd_expand_txgs); + mutex_destroy(&vdrz->vd_expand_lock); + mutex_destroy(&vdrz->vn_vre.vre_lock); + cv_destroy(&vdrz->vn_vre.vre_cv); + zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); + kmem_free(vdrz, sizeof (*vdrz)); } /* @@ -2607,6 +4946,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + + mutex_enter(&vdrz->vd_expand_lock); + if (!avl_is_empty(&vdrz->vd_expand_txgs)) { + uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + uint64_t i = 0; + + for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } + mutex_exit(&vdrz->vd_expand_lock); } static uint64_t @@ -2646,3 +5008,15 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion after reflowing this many bytes"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, + "Max amount of concurrent i/o for RAIDZ expansion"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, + "For expanded RAIDZ, aggregate reads that have more rows than this"); +ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, + "For expanded RAIDZ, automatically start a pool scrub when expansion " + "completes"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c index 03df2df5adaf..e12b96170f55 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,7 @@ * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 
*/ +#include <sys/simd.h> #include <sys/zfs_context.h> #include <sys/types.h> #include <sys/zio.h> @@ -29,7 +30,6 @@ #include <sys/zfs_debug.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> -#include <sys/simd.h> /* Opaque implementation with NULL methods to represent original methods */ static const raidz_impl_ops_t vdev_raidz_original_impl = { @@ -43,7 +43,7 @@ static raidz_impl_ops_t vdev_raidz_fastest_impl = { }; /* All compiled in implementations */ -const raidz_impl_ops_t *raidz_all_maths[] = { +static const raidz_impl_ops_t *const raidz_all_maths[] = { &vdev_raidz_original_impl, &vdev_raidz_scalar_impl, #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ @@ -268,10 +268,10 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr, return (rec_fn(rr, dt)); } -const char *raidz_gen_name[] = { +const char *const raidz_gen_name[] = { "gen_p", "gen_pq", "gen_pqr" }; -const char *raidz_rec_name[] = { +const char *const raidz_rec_name[] = { "rec_p", "rec_q", "rec_r", "rec_pq", "rec_pr", "rec_qr", "rec_pqr" }; @@ -283,22 +283,19 @@ const char *raidz_rec_name[] = { static int raidz_math_kstat_headers(char *buf, size_t size) { - int i; - ssize_t off; - ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); - off = snprintf(buf, size, "%-17s", "implementation"); + ssize_t off = kmem_scnprintf(buf, size, "%-17s", "implementation"); - for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) - off += snprintf(buf + off, size - off, "%-16s", + for (int i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_gen_name[i]); - for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) - off += snprintf(buf + off, size - off, "%-16s", + for (int i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_rec_name[i]); - (void) snprintf(buf + off, size - off, "\n"); + (void) kmem_scnprintf(buf + off, size - off, "\n"); return (0); } @@ -314,34 +311,35 @@ raidz_math_kstat_data(char *buf, size_t size, void *data) ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); if (cstat == fstat) { - off += snprintf(buf + off, size - off, "%-17s", "fastest"); + off += kmem_scnprintf(buf + off, size - off, "%-17s", + "fastest"); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) { int id = fstat->gen[i]; - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) { int id = fstat->rec[i]; - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } } else { ptrdiff_t id = cstat - raidz_impl_kstats; - off += snprintf(buf + off, size - off, "%-17s", + off += kmem_scnprintf(buf + off, size - off, "%-17s", raidz_supp_impl[id]->name); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) - off += snprintf(buf + off, size - off, "%-16llu", + off += kmem_scnprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->gen[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) - off += snprintf(buf + off, size - off, "%-16llu", + off += kmem_scnprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->rec[i]); } - (void) snprintf(buf + off, size - off, "\n"); + (void) kmem_scnprintf(buf + off, size - off, "\n"); return (0); } @@ -566,7 +564,7 @@ vdev_raidz_math_fini(void) } static const struct { - char *name; + const char *name; uint32_t sel; } math_impl_opts[] = { { "cycle", IMPL_CYCLE }, @@ -655,13 +653,15 @@ zfs_vdev_raidz_impl_get(char 
*buffer, zfs_kernel_param_t *kp) /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); + cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, + math_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < raidz_supp_impl_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); + cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, + raidz_supp_impl[i]->name); } return (cnt); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c index 0a67ceb84920..4aa7bc2b9708 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h index e46b2536546c..f0f6546f7f71 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c index e072f51cd635..bd9de91a4ba8 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -210,9 +210,13 @@ DEFINE_GEN_METHODS(aarch64_neonx2); * If compiled with -O0, gcc doesn't do any stack frame coalescing * and -Wframe-larger-than=1024 is triggered in debug mode. */ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif DEFINE_REC_METHODS(aarch64_neonx2); +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop +#endif static boolean_t raidz_will_aarch64_neonx2_work(void) diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c index 65e4bebce8fa..e5bbc7decbfa 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c index f06b469023eb..3b709ed34fc4 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c index aab653b77491..5ec71a04133a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h index 35e016fc65a5..5d77c5d046d5 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -214,9 +214,10 @@ raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) } -#define raidz_copy(dabd, sabd, size) \ +#define raidz_copy(dabd, sabd, off, size) \ { \ - abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ + abd_iterate_func2(dabd, sabd, off, off, size, raidz_copy_abd_cb, \ + NULL); \ } /* @@ -254,9 +255,10 @@ raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) return (0); } -#define raidz_add(dabd, sabd, size) \ +#define raidz_add(dabd, sabd, off, size) \ { \ - abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ + abd_iterate_func2(dabd, sabd, off, off, size, raidz_add_abd_cb, \ + NULL); \ } /* @@ -343,7 +345,10 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private) * the parity/syndrome if data column is shorter. * * P parity is calculated using raidz_add_abd(). + * + * For CPU L2 cache blocking we process 64KB at a time. 
*/ +#define CHUNK 65536 /* * Generate P parity (RAIDZ1) @@ -357,20 +362,26 @@ raidz_generate_p_impl(raidz_row_t * const rr) const size_t ncols = rr->rr_cols; const size_t psize = rr->rr_col[CODE_P].rc_size; abd_t *pabd = rr->rr_col[CODE_P].rc_abd; - size_t size; - abd_t *dabd; + size_t off, size; raidz_math_begin(); - /* start with first data column */ - raidz_copy(pabd, rr->rr_col[1].rc_abd, psize); + for (off = 0; off < psize; off += CHUNK) { + + /* start with first data column */ + size = MIN(CHUNK, psize - off); + raidz_copy(pabd, rr->rr_col[1].rc_abd, off, size); - for (c = 2; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - size = rr->rr_col[c].rc_size; + for (c = 2; c < ncols; c++) { + size = rr->rr_col[c].rc_size; + if (size <= off) + continue; - /* add data column */ - raidz_add(pabd, dabd, size); + /* add data column */ + size = MIN(CHUNK, size - off); + abd_t *dabd = rr->rr_col[c].rc_abd; + raidz_add(pabd, dabd, off, size); + } } raidz_math_end(); @@ -423,7 +434,7 @@ raidz_generate_pq_impl(raidz_row_t * const rr) size_t c; const size_t ncols = rr->rr_cols; const size_t csize = rr->rr_col[CODE_P].rc_size; - size_t dsize; + size_t off, size, dsize; abd_t *dabd; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, @@ -432,15 +443,20 @@ raidz_generate_pq_impl(raidz_row_t * const rr) raidz_math_begin(); - raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize); + for (off = 0; off < csize; off += CHUNK) { - for (c = 3; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - dsize = rr->rr_col[c].rc_size; + size = MIN(CHUNK, csize - off); + raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, off, size); + raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, off, size); - abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, - raidz_gen_pq_add); + for (c = 3; c < ncols; c++) { + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; + dsize = (dsize > off) ? 
MIN(CHUNK, dsize - off) : 0; + + abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 2, + raidz_gen_pq_add); + } } raidz_math_end(); @@ -460,8 +476,8 @@ static void raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, const size_t dsize) { - v_t *p = (v_t *)c[0]; - v_t *q = (v_t *)c[1]; + v_t *p = (v_t *)c[CODE_P]; + v_t *q = (v_t *)c[CODE_Q]; v_t *r = (v_t *)c[CODE_R]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); @@ -486,7 +502,7 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, /* - * Generate PQR parity (RAIDZ2) + * Generate PQR parity (RAIDZ3) * * @rr RAIDZ row */ @@ -496,7 +512,7 @@ raidz_generate_pqr_impl(raidz_row_t * const rr) size_t c; const size_t ncols = rr->rr_cols; const size_t csize = rr->rr_col[CODE_P].rc_size; - size_t dsize; + size_t off, size, dsize; abd_t *dabd; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, @@ -506,16 +522,21 @@ raidz_generate_pqr_impl(raidz_row_t * const rr) raidz_math_begin(); - raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize); + for (off = 0; off < csize; off += CHUNK) { - for (c = 4; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - dsize = rr->rr_col[c].rc_size; + size = MIN(CHUNK, csize - off); + raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, off, size); + raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, off, size); + raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, off, size); - abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, - raidz_gen_pqr_add); + for (c = 4; c < ncols; c++) { + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; + dsize = (dsize > off) ? MIN(CHUNK, dsize - off) : 0; + + abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 3, + raidz_gen_pqr_add); + } } raidz_math_end(); @@ -592,26 +613,31 @@ raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx) const size_t x = tgtidx[TARGET_X]; const size_t xsize = rr->rr_col[x].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; - size_t size; - abd_t *dabd; + size_t off, size; if (xabd == NULL) return (1 << CODE_P); raidz_math_begin(); - /* copy P into target */ - raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize); + for (off = 0; off < xsize; off += CHUNK) { - /* generate p_syndrome */ - for (c = firstdc; c < ncols; c++) { - if (c == x) - continue; + /* copy P into target */ + size = MIN(CHUNK, xsize - off); + raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, off, size); - dabd = rr->rr_col[c].rc_abd; - size = MIN(rr->rr_col[c].rc_size, xsize); + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; + size = rr->rr_col[c].rc_size; + if (size <= off) + continue; - raidz_add(xabd, dabd, size); + size = MIN(CHUNK, MIN(size, xsize) - off); + abd_t *dabd = rr->rr_col[c].rc_abd; + raidz_add(xabd, dabd, off, size); + } } raidz_math_end(); @@ -683,7 +709,7 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); } @@ -698,12 +724,12 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1, raidz_syn_q_abd); } /* add Q to the syndrome */ - raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize); + 
raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, 0, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); @@ -777,7 +803,7 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); } @@ -793,12 +819,12 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1, raidz_syn_r_abd); } /* add R to the syndrome */ - raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, 0, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); @@ -934,8 +960,8 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -951,7 +977,7 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_pq_abd); } @@ -959,7 +985,7 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); @@ -1094,8 +1120,8 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1111,7 +1137,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_pr_abd); } @@ -1121,7 +1147,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); @@ -1261,8 +1287,8 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1278,7 +1304,7 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_qr_abd); } @@ -1288,7 +1314,7 @@ 
raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); @@ -1456,9 +1482,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1475,7 +1501,7 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 3, raidz_syn_pqr_abd); } @@ -1485,9 +1511,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); if (zsize < xsize) - raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize); + raidz_copy(rr->rr_col[z].rc_abd, zabd, 0, zsize); raidz_math_end(); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c index 1db2c4cd3a47..ff493b8b7bc0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h index 3842f5fd637c..f76eb47a9c66 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -26,10 +26,6 @@ #include <sys/types.h> #include <sys/simd.h> -#ifdef __linux__ -#define __asm __asm__ __volatile__ -#endif - #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) 
_REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) @@ -142,7 +138,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "lvx 21,0,%[SRC0]\n" \ "lvx 20,0,%[SRC1]\n" \ "lvx 19,0,%[SRC2]\n" \ @@ -172,7 +168,7 @@ typedef struct v { : "v18", "v19", "v20", "v21"); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "lvx 21,0,%[SRC0]\n" \ "lvx 20,0,%[SRC1]\n" \ "lvx 19,0,%[SRC2]\n" \ @@ -189,7 +185,7 @@ typedef struct v { : "v18", "v19", "v20", "v21"); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "lvx 21,0,%[SRC0]\n" \ "lvx 20,0,%[SRC1]\n" \ "vxor " VR0(r) "," VR0(r) ",21\n" \ @@ -208,7 +204,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR4(r) "," VR4(r) "," VR0(r) "\n" \ "vxor " VR5(r) "," VR5(r) "," VR1(r) "\n" \ "vxor " VR6(r) "," VR6(r) "," VR2(r) "\n" \ @@ -217,7 +213,7 @@ typedef struct v { : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR2(r) "," VR2(r) "," VR0(r) "\n" \ "vxor " VR3(r) "," VR3(r) "," VR1(r) "\n" \ : UVR2(r), UVR3(r) \ @@ -232,7 +228,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ @@ -245,7 +241,7 @@ typedef struct v { WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ @@ -253,7 +249,7 @@ typedef struct v { : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ : WVR0(r), WVR1(r)); \ @@ -267,7 +263,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "vor " VR4(r) "," VR0(r) "," VR0(r) "\n" \ "vor " VR5(r) "," VR1(r) "," VR1(r) "\n" \ "vor " VR6(r) "," VR2(r) "," VR2(r) "\n" \ @@ -276,7 +272,7 @@ typedef struct v { : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vor " VR2(r) "," VR0(r) "," VR0(r) "\n" \ "vor " VR3(r) "," VR1(r) "," VR1(r) "\n" \ : WVR2(r), WVR3(r) \ @@ -291,7 +287,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "lvx " VR0(r) " ,0,%[SRC0]\n" \ "lvx " VR1(r) " ,0,%[SRC1]\n" \ "lvx " VR2(r) " ,0,%[SRC2]\n" \ @@ -312,7 +308,7 @@ typedef struct v { [SRC7] "r" ((OFFSET(src, 112)))); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "lvx " VR0(r) " ,0,%[SRC0]\n" \ "lvx " VR1(r) " ,0,%[SRC1]\n" \ "lvx " VR2(r) " ,0,%[SRC2]\n" \ @@ -324,7 +320,7 @@ typedef struct v { [SRC3] "r" ((OFFSET(src, 48)))); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "lvx " VR0(r) " ,0,%[SRC0]\n" \ "lvx " VR1(r) " ,0,%[SRC1]\n" \ : WVR0(r), WVR1(r) \ @@ -340,7 +336,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 8: \ - __asm( \ + __asm__ __volatile__( \ "stvx " VR0(r) " ,0,%[DST0]\n" \ "stvx " VR1(r) " ,0,%[DST1]\n" \ "stvx " VR2(r) " ,0,%[DST2]\n" \ @@ -362,7 +358,7 @@ typedef struct v { : "memory"); \ break; \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "stvx " VR0(r) " ,0,%[DST0]\n" \ "stvx " VR1(r) " ,0,%[DST1]\n" \ "stvx " VR2(r) " ,0,%[DST2]\n" \ @@ -375,7 +371,7 @@ typedef struct v { : "memory"); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "stvx " 
VR0(r) " ,0,%[DST0]\n" \ "stvx " VR1(r) " ,0,%[DST1]\n" \ : : [DST0] "r" ((OFFSET(dst, 0))), \ @@ -400,7 +396,7 @@ typedef struct v { #define MUL2_SETUP() \ { \ - __asm( \ + __asm__ __volatile__( \ "vspltisb " VR(16) ",14\n" \ "vspltisb " VR(17) ",15\n" \ "vaddubm " VR(16) "," VR(17) "," VR(16) "\n" \ @@ -412,7 +408,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 4: \ - __asm( \ + __asm__ __volatile__( \ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ "vcmpgtsb 21," VR(17) "," VR2(r) "\n" \ @@ -434,7 +430,7 @@ typedef struct v { : "v18", "v19", "v20", "v21"); \ break; \ case 2: \ - __asm( \ + __asm__ __volatile__( \ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ "vand 19,19," VR(16) "\n" \ @@ -478,7 +474,7 @@ typedef struct v { { \ switch (REG_CNT(r)) { \ case 2: \ - __asm( \ + __asm__ __volatile__( \ /* lts for upper part */ \ "vspltisb 15,15\n" \ "lvx 10,0,%[lt0]\n" \ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c index 9e9c15ff4ba2..b51352b4e90b 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -142,7 +142,7 @@ static const struct { a.b[6] = mul_lt[a.b[6]]; \ a.b[5] = mul_lt[a.b[5]]; \ a.b[4] = mul_lt[a.b[4]]; \ - fallthrough; \ + zfs_fallthrough; \ case 4: \ a.b[3] = mul_lt[a.b[3]]; \ a.b[2] = mul_lt[a.b[2]]; \ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c index 56a0b123d952..02b5d6a609ab 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c index 5ddc079a4f5d..244f137b3d09 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 4d7de0c6c44c..8a8b02cab5c6 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,8 @@ * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. + * Copyright (c) 2024 by Delphix. All rights reserved. */ #include <sys/vdev_impl.h> @@ -33,6 +35,7 @@ #include <sys/zio.h> #include <sys/dmu_tx.h> #include <sys/arc.h> +#include <sys/arc_impl.h> #include <sys/zap.h> /* @@ -103,7 +106,7 @@ * Size of rebuild reads; defaults to 1MiB per data disk and is capped at * SPA_MAXBLOCKSIZE. */ -unsigned long zfs_rebuild_max_segment = 1024 * 1024; +static uint64_t zfs_rebuild_max_segment = 1024 * 1024; /* * Maximum number of parallelly executed bytes per leaf vdev caused by a @@ -115,25 +118,25 @@ unsigned long zfs_rebuild_max_segment = 1024 * 1024; * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep * the queue depth short. * - * 32MB was selected as the default value to achieve good performance with - * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential - * rebuild was unable to saturate all of the drives using smaller values. - * With a value of 32MB the sequential resilver write rate was measured at - * 800MB/s sustained while rebuilding to a distributed spare. + * 64MB was observed to deliver the best performance and set as the default. + * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) + * and a rebuild rate of 1.2GB/s was measured to the distribute spare. + * Smaller values were unable to fully saturate the available pool I/O. */ -unsigned long zfs_rebuild_vdev_limit = 32 << 20; +static uint64_t zfs_rebuild_vdev_limit = 64 << 20; /* * Automatically start a pool scrub when the last active sequential resilver * completes in order to verify the checksums of all blocks which have been * resilvered. This option is enabled by default and is strongly recommended. */ -int zfs_rebuild_scrub_enabled = 1; +static int zfs_rebuild_scrub_enabled = 1; /* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ -static void vdev_rebuild_thread(void *arg); +static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg); +static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx); /* * Clear the per-vdev rebuild bytes value for a vdev tree. @@ -227,7 +230,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); mutex_enter(&vd->vdev_rebuild_lock); - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); @@ -260,7 +263,7 @@ vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) } static void -vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) +vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, const char *name) { nvlist_t *aux = fnvlist_alloc(); @@ -307,6 +310,17 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); + + /* + * Handle a second device failure if it occurs after all rebuild I/O + * has completed but before this sync task has been executed. 
+ */ + if (vd->vdev_rebuild_reset_wanted) { + mutex_exit(&vd->vdev_rebuild_lock); + vdev_rebuild_reset_sync(arg, tx); + return; + } + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; vrp->vrp_end_time = gethrestime_sec(); @@ -448,7 +462,7 @@ vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) } clear_rebuild_bytes(vd); - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { @@ -558,8 +572,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vdev_rebuild_blkptr_init(&blk, vd, start, size); uint64_t psize = BP_GET_PSIZE(&blk); - if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) { + vr->vr_pass_bytes_skipped += size; return (0); + } mutex_enter(&vr->vr_io_lock); @@ -701,7 +717,7 @@ vdev_rebuild_load(vdev_t *vd) vd->vdev_rebuilding = B_FALSE; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); mutex_exit(&vd->vdev_rebuild_lock); return (SET_ERROR(ENOTSUP)); } @@ -718,7 +734,7 @@ vdev_rebuild_load(vdev_t *vd) * status allowing a new resilver/rebuild to be started. */ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { - bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); } else if (err) { mutex_exit(&vd->vdev_rebuild_lock); return (err); @@ -736,11 +752,12 @@ vdev_rebuild_load(vdev_t *vd) * Each scan thread is responsible for rebuilding a top-level vdev. The * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. */ -static void +static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int error = 0; /* @@ -760,7 +777,6 @@ vdev_rebuild_thread(void *arg) ASSERT(vd->vdev_rebuilding); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); - ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; @@ -773,9 +789,7 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; - - vr->vr_bytes_inflight_max = MAX(1ULL << 20, - zfs_rebuild_vdev_limit * vd->vdev_children); + vr->vr_pass_bytes_skipped = 0; uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -793,6 +807,17 @@ vdev_rebuild_thread(void *arg) vr->vr_scan_msp = msp; /* + * Calculate the max number of in-flight bytes for top-level + * vdev scanning operations (minimum 1MB, maximum 1/2 of + * arc_c_max shared by all top-level vdevs). Limits for the + * issuing phase are done per top-level vdev and are handled + * separately. + */ + uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1); + vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children)); + + /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. 
The * vdev_rebuild_cancel_wanted flag is set until the sync task @@ -1047,7 +1072,8 @@ vdev_rebuild_restart_impl(vdev_t *vd) void vdev_rebuild_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); vdev_rebuild_restart_impl(spa->spa_root_vdev); } @@ -1061,7 +1087,8 @@ vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) @@ -1111,7 +1138,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); if (error == ENOENT) { - bzero(vrs, sizeof (vdev_rebuild_stat_t)); + memset(vrs, 0, sizeof (vdev_rebuild_stat_t)); vrs->vrs_state = VDEV_REBUILD_NONE; error = 0; } else if (error == 0) { @@ -1132,19 +1159,18 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped; mutex_exit(&tvd->vdev_rebuild_lock); } return (error); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW, "Max segment size in bytes of rebuild reads"); -ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for sequential resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, "Automatically scrub after sequential resilver completes"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index f762c1df96aa..1249657f9d72 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -94,7 +94,7 @@ typedef struct vdev_copy_arg { * doing a device removal. This determines how much i/o we can have * in flight concurrently. */ -int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; +static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when @@ -104,7 +104,7 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -112,7 +112,7 @@ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; * not be cancelled. This can result in a normally recoverable block * becoming permanently damaged and is not recommended. */ -int zfs_removal_ignore_errors = 0; +static int zfs_removal_ignore_errors = 0; /* * Allow a remap segment to span free chunks of at most this size. 
The main @@ -130,7 +130,7 @@ int zfs_removal_ignore_errors = 0; * - we'll do larger allocations, which may fail and fall back on smaller * allocations */ -int vdev_removal_max_span = 32 * 1024; +uint_t vdev_removal_max_span = 32 * 1024; /* * This is used by the test suite so that it can ensure that certain @@ -140,7 +140,7 @@ int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" -static void spa_vdev_remove_thread(void *arg); +static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg); static int spa_vdev_remove_cancel_impl(spa_t *spa); static void @@ -168,8 +168,178 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) } static void -spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) +vdev_activate(vdev_t *vd) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + uint64_t vdev_space = spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + + ASSERT(!vd->vdev_islog); + ASSERT(vd->vdev_noalloc); + + metaslab_group_activate(mg); + metaslab_group_activate(vd->vdev_log_mg); + + ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space); + + spa->spa_nonallocating_dspace -= vdev_space; + + vd->vdev_noalloc = B_FALSE; +} + +static int +vdev_passivate(vdev_t *vd, uint64_t *txg) +{ + spa_t *spa = vd->vdev_spa; + int error; + + ASSERT(!vd->vdev_noalloc); + + vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *normal = spa_normal_class(spa); + if (mg->mg_class == normal) { + /* + * We must check that this is not the only allocating device in + * the pool before passivating, otherwise we will not be able + * to make progress because we can't allocate from any vdevs. + */ + boolean_t last = B_TRUE; + for (uint64_t id = 0; id < rvd->vdev_children; id++) { + vdev_t *cvd = rvd->vdev_child[id]; + + if (cvd == vd || + cvd->vdev_ops == &vdev_indirect_ops) + continue; + + metaslab_class_t *mc = cvd->vdev_mg->mg_class; + if (mc != normal) + continue; + + if (!cvd->vdev_noalloc) { + last = B_FALSE; + break; + } + } + if (last) + return (SET_ERROR(EINVAL)); + } + + metaslab_group_passivate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * We must ensure that no "stubby" log blocks are allocated + * on the device to be removed. These blocks could be + * written at any time, including while we are in the middle + * of copying them. + */ + error = spa_reset_logs(spa); + + *txg = spa_vdev_config_enter(spa); + + if (error != 0) { + metaslab_group_activate(mg); + ASSERT(!vd->vdev_islog); + if (vd->vdev_log_mg != NULL) + metaslab_group_activate(vd->vdev_log_mg); + return (error); + } + + spa->spa_nonallocating_dspace += spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + vd->vdev_noalloc = B_TRUE; + + return (0); +} + +/* + * Turn off allocations for a top-level device from the pool. + * + * Turning off allocations for a top-level device can take a significant + * amount of time. As a result we use the spa_vdev_config_[enter/exit] + * functions which allow us to grab and release the spa_config_lock while + * still holding the namespace lock. During each step the configuration + * is synced out. 
+ */ +int +spa_vdev_noalloc(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + uint64_t txg; + int error = 0; + + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (vd == NULL) + error = SET_ERROR(ENOENT); + else if (vd->vdev_mg == NULL) + error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); + else if (!vd->vdev_noalloc) + error = vdev_passivate(vd, &txg); + + if (error == 0) { + vdev_dirty_leaves(vd, VDD_DTL, txg); + vdev_config_dirty(vd); + } + + error = spa_vdev_exit(spa, NULL, txg, error); + + return (error); +} + +int +spa_vdev_alloc(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + uint64_t txg; + int error = 0; + + ASSERT(!MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (vd == NULL) + error = SET_ERROR(ENOENT); + else if (vd->vdev_mg == NULL) + error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); + else if (!vd->vdev_removing) + vdev_activate(vd); + + if (error == 0) { + vdev_dirty_leaves(vd, VDD_DTL, txg); + vdev_config_dirty(vd); + } + + (void) spa_vdev_exit(spa, NULL, txg, error); + + return (error); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev, + int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; @@ -183,7 +353,8 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev, + count - 1); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); @@ -997,11 +1168,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, metaslab_class_t *mc = mg->mg_class; if (mc->mc_groups == 0) mc = spa_normal_class(spa); - int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, - zal, 0); + int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, + METASLAB_DONT_THROTTLE, zal, 0); if (error == ENOSPC && mc != spa_normal_class(spa)) { error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, - &dst, 0, NULL, txg, 0, zal, 0); + &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0); } if (error != 0) return (error); @@ -1193,6 +1364,10 @@ vdev_remove_complete(spa_t *spa) ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + vdev_rebuild_stop_wait(vd); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + uint64_t vdev_space = spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); @@ -1200,6 +1375,12 @@ vdev_remove_complete(spa_t *spa) zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)txg); + ASSERT3U(0, !=, vdev_space); + ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space); + + /* the vdev is no longer part of the dspace */ + spa->spa_nonallocating_dspace -= vdev_space; + /* * Discard allocation state. 
*/ @@ -1207,7 +1388,6 @@ vdev_remove_complete(spa_t *spa) vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; - spa_log_sm_set_blocklimit(spa); } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); @@ -1410,7 +1590,7 @@ spa_remove_max_segment(spa_t *spa) * TXG have completed (see spa_txg_zio) and writes the new mappings to disk * (see vdev_mapping_sync()). */ -static void +static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg) { spa_t *spa = arg; @@ -1619,10 +1799,32 @@ spa_vdev_remove_suspend(spa_t *spa) mutex_exit(&svr->svr_lock); } -/* ARGSUSED */ +/* + * Return true if the "allocating" property has been set to "off" + */ +static boolean_t +vdev_prop_allocating_off(vdev_t *vd) +{ + uint64_t objid = vd->vdev_top_zap; + uint64_t allocating = 1; + + /* no vdev property object => no props */ + if (objid != 0) { + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + + mutex_enter(&spa->spa_props_lock); + (void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t), + 1, &allocating); + mutex_exit(&spa->spa_props_lock); + } + return (allocating == 0); +} + static int spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (spa->spa_vdev_removal == NULL) @@ -1634,10 +1836,10 @@ spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) * Cancel a removal by freeing all entries from the partial mapping * and marking the vdev as no longer being removing. */ -/* ARGSUSED */ static void spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { + (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); @@ -1761,6 +1963,13 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; + + if (!vdev_prop_allocating_off(vd)) { + spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); + vdev_activate(vd); + spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); + } + vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", @@ -1774,21 +1983,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) static int spa_vdev_remove_cancel_impl(spa_t *spa) { - uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; - int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); - - if (error == 0) { - spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); - vdev_t *vd = vdev_lookup_top(spa, vdid); - metaslab_group_activate(vd->vdev_mg); - ASSERT(!vd->vdev_islog); - metaslab_group_activate(vd->vdev_log_mg); - spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); - } - return (error); } @@ -1935,7 +2132,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); - spa_log_sm_set_blocklimit(spa); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa); @@ -1984,6 +2180,11 @@ spa_vdev_remove_top_check(vdev_t *vd) if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); + /* + * This device is already being removed + */ + if (vd->vdev_removing) + return (SET_ERROR(EALREADY)); metaslab_class_t *mc = vd->vdev_mg->mg_class; metaslab_class_t *normal = spa_normal_class(spa); @@ -2002,20 +2203,12 @@ spa_vdev_remove_top_check(vdev_t *vd) ASSERT3U(available, >=, vd->vdev_stat.vs_alloc); if (available < vd->vdev_stat.vs_alloc) return 
(SET_ERROR(ENOSPC)); - } else { + } else if (!vd->vdev_noalloc) { /* available space in the pool's normal class */ uint64_t available = dsl_dir_space_available( spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); - if (available < - vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { - /* - * This is a normal device. There has to be enough free - * space to remove the device and leave double the - * "slop" space (i.e. we must leave at least 3% of the - * pool free, in addition to the normal slop space). - */ + if (available < vd->vdev_stat.vs_dspace) return (SET_ERROR(ENOSPC)); - } } /* @@ -2058,7 +2251,6 @@ spa_vdev_remove_top_check(vdev_t *vd) * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; - int num_indirect = 0; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; @@ -2074,8 +2266,6 @@ spa_vdev_remove_top_check(vdev_t *vd) if (cvd->vdev_ashift != 0 && cvd->vdev_alloc_bias == VDEV_BIAS_NONE) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); - if (cvd->vdev_ops == &vdev_indirect_ops) - num_indirect++; if (!vdev_is_concrete(cvd)) continue; if (vdev_get_nparity(cvd) != 0) @@ -2108,6 +2298,7 @@ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; + boolean_t set_noalloc = B_FALSE; int error; /* @@ -2116,8 +2307,6 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * are errors. */ error = spa_vdev_remove_top_check(vd); - if (error != 0) - return (error); /* * Stop allocating from this vdev. Note that we must check @@ -2127,31 +2316,22 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * The above check for sufficient free space serves this * purpose. */ - metaslab_group_t *mg = vd->vdev_mg; - metaslab_group_passivate(mg); - ASSERT(!vd->vdev_islog); - metaslab_group_passivate(vd->vdev_log_mg); - - /* - * Wait for the youngest allocations and frees to sync, - * and then wait for the deferral of those frees to finish. - */ - spa_vdev_config_exit(spa, NULL, - *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + if (error == 0 && !vd->vdev_noalloc) { + set_noalloc = B_TRUE; + error = vdev_passivate(vd, txg); + } - /* - * We must ensure that no "stubby" log blocks are allocated - * on the device to be removed. These blocks could be - * written at any time, including while we are in the middle - * of copying them. - */ - error = spa_reset_logs(spa); + if (error != 0) + return (error); /* * We stop any initializing and TRIM that is currently in progress * but leave the state as "active". This will allow the process to * resume if the removal is canceled sometime later. */ + + spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(vd); @@ -2162,13 +2342,11 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. 
*/ - if (error == 0) - error = spa_vdev_remove_top_check(vd); + error = spa_vdev_remove_top_check(vd); if (error != 0) { - metaslab_group_activate(mg); - ASSERT(!vd->vdev_islog); - metaslab_group_activate(vd->vdev_log_mg); + if (set_noalloc) + vdev_activate(vd); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); @@ -2206,7 +2384,8 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) int error = 0, error_log; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; - char *vd_type = NULL, *vd_path = NULL; + const char *vd_type = NULL; + char *vd_path = NULL; ASSERT(spa_writeable(spa)); @@ -2235,7 +2414,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. */ if (vd == NULL || unspare) { - char *type; + const char *type; boolean_t draid_spare = B_FALSE; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) @@ -2363,17 +2542,17 @@ spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) return (0); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, "Ignore hard IO errors when removing device"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW, "Largest contiguous segment to allocate when removing device"); -ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW, +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_root.c b/sys/contrib/openzfs/module/zfs/vdev_root.c index 45ddc2f71927..e132643dc330 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_root.c +++ b/sys/contrib/openzfs/module/zfs/vdev_root.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index deea7fedd770..9cf10332e8bf 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,9 +20,10 @@ */ /* - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2024 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. * Copyright (c) 2021 Hewlett Packard Enterprise Development LP + * Copyright 2023 RackTop Systems, Inc. */ #include <sys/spa.h> @@ -96,12 +97,12 @@ /* * Maximum size of TRIM I/O, ranges will be chunked in to 128MiB lengths. 
*/ -unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024; +static unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024; /* * Minimum size of TRIM I/O, extents smaller than 32Kib will be skipped. */ -unsigned int zfs_trim_extent_bytes_min = 32 * 1024; +static unsigned int zfs_trim_extent_bytes_min = 32 * 1024; /* * Skip uninitialized metaslabs during the TRIM process. This option is @@ -118,7 +119,7 @@ unsigned int zfs_trim_metaslab_skip = 0; * concurrent TRIM I/Os issued to the device is controlled by the * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options. */ -unsigned int zfs_trim_queue_limit = 10; +static unsigned int zfs_trim_queue_limit = 10; /* * The minimum number of transaction groups between automatic trims of a @@ -134,7 +135,7 @@ unsigned int zfs_trim_queue_limit = 10; * has the opposite effect. The default value of 32 was determined though * testing to be a reasonable compromise. */ -unsigned int zfs_trim_txg_batch = 32; +static unsigned int zfs_trim_txg_batch = 32; /* * The trim_args are a control structure which describe how a leaf vdev @@ -168,7 +169,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -179,10 +181,31 @@ vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } /* + * Wait for given number of kicks, return true if the wait is aborted due to + * vdev_autotrim_exit_wanted. + */ +static boolean_t +vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) +{ + mutex_enter(&vd->vdev_autotrim_lock); + for (int i = 0; i < num_of_kick; i++) { + if (vd->vdev_autotrim_exit_wanted) + break; + cv_wait_idle(&vd->vdev_autotrim_kick_cv, + &vd->vdev_autotrim_lock); + } + boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; + mutex_exit(&vd->vdev_autotrim_lock); + + return (exit_wanted); +} + +/* * The sync task for updating the on-disk state of a manual TRIM. This * is scheduled by vdev_trim_change_state(). */ @@ -202,7 +225,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -572,6 +596,7 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; + int error = 0; ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; @@ -591,19 +616,32 @@ vdev_trim_ranges(trim_args_t *ta) uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { - int error; - error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { - return (error); + goto done; } } } - return (0); +done: + /* + * Make sure all TRIMs for this metaslab have completed before + * returning. 
TRIM zios have lower priority over regular or syncing + * zios, so all TRIM zios for this metaslab must complete before the + * metaslab is re-enabled. Otherwise it's possible write zios to + * this metaslab could cut ahead of still queued TRIM zios for this + * metaslab causing corruption if the ranges overlap. + */ + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[0] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + return (error); } static void @@ -834,7 +872,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) * by its ms_allocatable. While a metaslab is undergoing trimming it is * not eligible for new allocations. */ -static void +static __attribute__((noreturn)) void vdev_trim_thread(void *arg) { vdev_t *vd = arg; @@ -922,11 +960,6 @@ vdev_trim_thread(void *arg) } spa_config_exit(spa, SCL_CONFIG, FTAG); - mutex_enter(&vd->vdev_trim_io_lock); - while (vd->vdev_trim_inflight[0] > 0) { - cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); - } - mutex_exit(&vd->vdev_trim_io_lock); range_tree_destroy(ta.trim_tree); @@ -976,6 +1009,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1003,9 +1037,11 @@ vdev_trim_stop_wait_impl(vdev_t *vd) void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) { + (void) spa; vdev_t *vd; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); @@ -1044,7 +1080,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_export_thread == curthread); list_insert_tail(vd_list, vd); } } @@ -1080,7 +1117,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) list_t vd_list; vdev_t *vd_l2cache; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_export_thread == curthread); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); @@ -1113,7 +1151,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) void vdev_trim_restart(vdev_t *vd) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + vd->vdev_spa->spa_load_thread == curthread); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { @@ -1132,12 +1171,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, @@ -1174,7 +1214,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) * N.B. 
This behavior is different from a manual TRIM where a thread * is created for each leaf vdev, instead of each top-level vdev. */ -static void +static __attribute__((noreturn)) void vdev_autotrim_thread(void *arg) { vdev_t *vd = arg; @@ -1187,12 +1227,10 @@ vdev_autotrim_thread(void *arg) mutex_exit(&vd->vdev_autotrim_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; - uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; - while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); - boolean_t issued_trim = B_FALSE; + uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; + uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; /* * All of the metaslabs are divided in to groups of size @@ -1224,6 +1262,8 @@ vdev_autotrim_thread(void *arg) i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; range_tree_t *trim_tree; + boolean_t issued_trim = B_FALSE; + boolean_t wait_aborted = B_FALSE; spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); @@ -1374,7 +1414,18 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim, B_FALSE); + /* + * Wait for couples of kicks, to ensure the trim io is + * synced. If the wait is aborted due to + * vdev_autotrim_exit_wanted, we need to signal + * metaslab_enable() to wait for sync. + */ + if (issued_trim) { + wait_aborted = vdev_autotrim_wait_kick(vd, + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE); + } + + metaslab_enable(msp, wait_aborted, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { @@ -1388,17 +1439,14 @@ vdev_autotrim_thread(void *arg) } kmem_free(tap, sizeof (trim_args_t) * children); + + if (vdev_autotrim_should_stop(vd)) + break; } spa_config_exit(spa, SCL_CONFIG, FTAG); - /* - * After completing the group of metaslabs wait for the next - * open txg. This is done to make sure that a minimum of - * zfs_trim_txg_batch txgs will occur before these metaslabs - * are trimmed again. 
- */ - txg_wait_open(spa_get_dsl(spa), 0, issued_trim); + vdev_autotrim_wait_kick(vd, 1); shift++; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1454,7 +1502,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1476,11 +1525,9 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_enter(&tvd->vdev_autotrim_lock); if (tvd->vdev_autotrim_thread != NULL) { tvd->vdev_autotrim_exit_wanted = B_TRUE; - - while (tvd->vdev_autotrim_thread != NULL) { - cv_wait(&tvd->vdev_autotrim_cv, - &tvd->vdev_autotrim_lock); - } + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + cv_wait(&tvd->vdev_autotrim_cv, + &tvd->vdev_autotrim_lock); ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); tvd->vdev_autotrim_exit_wanted = B_FALSE; @@ -1488,6 +1535,24 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_exit(&tvd->vdev_autotrim_lock); } +void +vdev_autotrim_kick(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + vdev_t *root_vd = spa->spa_root_vdev; + vdev_t *tvd; + + for (uint64_t i = 0; i < root_vd->vdev_children; i++) { + tvd = root_vd->vdev_child[i]; + + mutex_enter(&tvd->vdev_autotrim_lock); + if (tvd->vdev_autotrim_thread != NULL) + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + mutex_exit(&tvd->vdev_autotrim_lock); + } +} + /* * Wait for all of the vdev_autotrim_thread associated with the pool to * be terminated (canceled or stopped). @@ -1507,19 +1572,19 @@ vdev_autotrim_stop_all(spa_t *spa) void vdev_autotrim_restart(spa_t *spa) { - ASSERT(MUTEX_HELD(&spa_namespace_lock)); - + ASSERT(MUTEX_HELD(&spa_namespace_lock) || + spa->spa_load_thread == curthread); if (spa->spa_autotrim) vdev_autotrim(spa); } -static void +static __attribute__((noreturn)) void vdev_trim_l2arc_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; l2arc_dev_t *dev = l2arc_vdev_get(vd); - trim_args_t ta; + trim_args_t ta = {0}; range_seg64_t physical_rs; ASSERT(vdev_is_concrete(vd)); @@ -1530,7 +1595,6 @@ vdev_trim_l2arc_thread(void *arg) vd->vdev_trim_partial = 0; vd->vdev_trim_secure = 0; - bzero(&ta, sizeof (ta)); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; @@ -1590,7 +1654,7 @@ vdev_trim_l2arc_thread(void *arg) */ spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, RW_READER); - bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); + memset(dev->l2ad_dev_hdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd); @@ -1654,9 +1718,9 @@ vdev_trim_l2arc(spa_t *spa) int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) { - trim_args_t ta; - range_seg64_t physical_rs; - int error; + trim_args_t ta = {0}; + range_seg64_t physical_rs; + int error; physical_rs.rs_start = start; physical_rs.rs_end = start + size; @@ -1664,8 +1728,8 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); - bzero(&ta, sizeof (ta)); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_SIMPLE; @@ -1708,19 +1772,17 @@ EXPORT_SYMBOL(vdev_autotrim_restart); EXPORT_SYMBOL(vdev_trim_l2arc); EXPORT_SYMBOL(vdev_trim_simple); -/* BEGIN CSTYLED */ 
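The vdev_trim.c changes above replace the old per-pass txg_wait_open() with a condition-variable handshake: a caller outside this hunk broadcasts vdev_autotrim_kick_cv through vdev_autotrim_kick() (presumably once per txg sync), vdev_autotrim_thread() counts those wakeups in vdev_autotrim_wait_kick(), and vdev_autotrim_stop_wait() broadcasts the same cv so a stopping thread is not left sleeping. A minimal userspace sketch of the wait-for-N-kicks pattern, using pthreads and illustrative names rather than the kernel cv_* primitives:

/*
 * Minimal sketch of the kick/wait handshake above; names are
 * illustrative stand-ins, not the ZFS primitives.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick_cv = PTHREAD_COND_INITIALIZER;
static bool exit_wanted;

/* Rough analogue of vdev_autotrim_wait_kick(): sleep for n kicks or exit. */
static bool
wait_kick(int n)
{
	pthread_mutex_lock(&lock);
	for (int i = 0; i < n; i++) {
		if (exit_wanted)
			break;
		pthread_cond_wait(&kick_cv, &lock);
	}
	bool aborted = exit_wanted;
	pthread_mutex_unlock(&lock);
	return (aborted);
}

/* Rough analogue of vdev_autotrim_kick(): one broadcast per "txg sync". */
static void
kick(void)
{
	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&kick_cv);
	pthread_mutex_unlock(&lock);
}

static void *
syncer(void *arg)
{
	(void) arg;
	for (int txg = 0; txg < 10; txg++) {
		usleep(1000);
		kick();
	}
	pthread_mutex_lock(&lock);	/* analogue of the stop/exit path */
	exit_wanted = true;
	pthread_cond_broadcast(&kick_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, syncer, NULL);
	printf("aborted=%d\n", wait_kick(8));	/* e.g. wait for 8 kicks */
	pthread_join(t, NULL);
	return (0);
}

Counting wakeups rather than sleeping for a fixed number of txgs lets the thread react immediately when vdev_autotrim_exit_wanted is set; if the wait is cut short that way, the aborted flag is handed to metaslab_enable() above so the TRIMs still get synced.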
ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, - "Max size of TRIM commands, larger will be split"); + "Max size of TRIM commands, larger will be split"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW, - "Min size of TRIM commands, smaller will be skipped"); + "Min size of TRIM commands, smaller will be skipped"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW, - "Skip metaslabs which have never been initialized"); + "Skip metaslabs which have never been initialized"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW, - "Min number of txgs to aggregate frees before issuing TRIM"); + "Min number of txgs to aggregate frees before issuing TRIM"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW, - "Max queued TRIMs outstanding per leaf vdev"); -/* END CSTYLED */ + "Max queued TRIMs outstanding per leaf vdev"); diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c index 6f03beef3bdb..03b76ea1b7bf 100644 --- a/sys/contrib/openzfs/module/zfs/zap.c +++ b/sys/contrib/openzfs/module/zfs/zap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,8 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com> + * Copyright (c) 2023, Klara Inc. */ /* @@ -41,6 +43,7 @@ #include <sys/spa.h> #include <sys/dmu.h> +#include <sys/dnode.h> #include <sys/zfs_context.h> #include <sys/zfs_znode.h> #include <sys/fs/zfs.h> @@ -76,13 +79,18 @@ * the zfs-specific implementation of the directory's st_size (which is * the number of entries). */ -int zap_iterate_prefetch = B_TRUE; +static int zap_iterate_prefetch = B_TRUE; -int fzap_default_block_shift = 14; /* 16k blocksize */ +/* + * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be + * collapsed into a single block. 
+ */ +int zap_shrink_enabled = B_TRUE; -extern inline zap_phys_t *zap_f_phys(zap_t *zap); +int fzap_default_block_shift = 14; /* 16k blocksize */ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); void fzap_byteswap(void *vbuf, size_t size) @@ -114,7 +122,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * explicitly zero it since it might be coming from an * initialized microzap */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; @@ -135,7 +143,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); @@ -184,7 +192,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -195,21 +203,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -257,7 +265,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -269,7 +277,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -298,16 +306,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. 
- */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -321,11 +322,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -370,7 +369,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -426,20 +425,36 @@ zap_leaf_evict_sync(void *dbu) static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); - rw_enter(&l->l_rwlock, RW_WRITER); - l->l_blkid = zap_allocate_blocks(zap, 1); - l->l_dbuf = NULL; + uint64_t blkid = zap_allocate_blocks(zap, 1); + dmu_buf_t *db = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, + blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); + + /* + * Create the leaf structure and stash it on the dbuf. If zap was + * recent shrunk or truncated, the dbuf might have been sitting in the + * cache waiting to be evicted, and so still have the old leaf attached + * to it. If so, just reuse it. 
+ */ + zap_leaf_t *l = dmu_buf_get_user(db); + if (l == NULL) { + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l->l_blkid = blkid; + l->l_dbuf = db; + rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, + &l->l_dbuf); + dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + } else { + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + } + + rw_enter(&l->l_rwlock, RW_WRITER); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); @@ -535,10 +550,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -600,6 +613,72 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) } static int +zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + int epb = bs >> 3; /* entries per block */ + int err = 0; + + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + /* + * Check for i/o errors + */ + for (int i = 0; i < nptrs; i += epb) { + uint64_t blk; + err = zap_idx_to_blk(zap, idx + i, &blk); + if (err != 0) { + return (err); + } + } + + for (int i = 0; i < nptrs; i++) { + err = zap_set_idx_to_blk(zap, idx + i, blk, tx); + ASSERT0(err); /* we checked for i/o errors above */ + if (err != 0) + break; + } + + return (err); +} + +#define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) + +/* + * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl. + * If two leaves are siblings, their ranges are adjacent and contain the same + * number of entries. In order to find out if a leaf has a sibling, we need to + * check the range corresponding to the sibling leaf. There is no need to check + * all entries in the range; we only need to check the first and the last one.
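As a concrete illustration of that range check (a standalone sketch; the two macros are re-derived for the example rather than pulled from the ZAP headers): with a pointer table of 2^12 entries, a leaf with prefix 0b0110 and prefix_len 4 has sibling prefix 0b0111, and the sibling's range is the 2^(12-4) = 256 consecutive ptrtbl entries starting at index 0x700. If the first and the last pointer of that range name the same block, that block is the sibling leaf.

#include <stdint.h>
#include <stdio.h>

#define	EX_PREFIX_HASH(pref, len)	((uint64_t)(pref) << (64 - (len)))
#define	EX_HASH_IDX(hash, shift)	((shift) == 0 ? 0 : ((hash) >> (64 - (shift))))

int
main(void)
{
	uint64_t zt_shift = 12;			/* ptrtbl holds 2^12 pointers */
	uint64_t prefix = 0x6, prefix_len = 4;	/* leaf prefix 0b0110 */

	uint64_t sl_prefix = prefix ^ 1;	/* sibling prefix 0b0111 */
	uint64_t nptrs = 1ULL << (zt_shift - prefix_len);
	uint64_t idx = EX_HASH_IDX(EX_PREFIX_HASH(sl_prefix, prefix_len),
	    zt_shift);

	/* The sibling, if any, owns ptrtbl[idx .. idx + nptrs - 1]. */
	printf("sibling 0x%llx: %llu pointers starting at index 0x%llx\n",
	    (unsigned long long)sl_prefix, (unsigned long long)nptrs,
	    (unsigned long long)idx);
	return (0);
}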
+ */ +static uint64_t +check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; + uint64_t nptrs = (1 << pref_diff); + uint64_t first; + uint64_t last; + + ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + + if (zap_idx_to_blk(zap, idx, &first) != 0) + return (0); + + if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) + return (0); + + if (first != last) + return (0); + return (first); +} + +static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { uint64_t blk; @@ -628,7 +707,7 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) static int zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx, zap_leaf_t **lp) + const void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; @@ -648,6 +727,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, uint64_t object = zap->zap_object; zap_put_leaf(l); + *lp = l = NULL; zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); @@ -717,7 +797,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, - void *tag, dmu_tx_t *tx) + const void *tag, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; @@ -826,7 +906,7 @@ fzap_lookup(zap_name_t *zn, int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) + const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; @@ -857,28 +937,24 @@ retry: } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) { + if (err == 0) goto retry; - } else if (err == ENOSPC) { - /* - * If we failed to expand the leaf, then bailout - * as there is no point trying - * zap_put_leaf_maybe_grow_ptrtbl(). 
- */ - return (err); - } } out: - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, void *tag, dmu_tx_t *tx) + const void *val, const void *tag, dmu_tx_t *tx) { int err = fzap_check(zn, integer_size, num_integers); if (err != 0) @@ -891,7 +967,7 @@ fzap_add(zap_name_t *zn, int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, - void *tag, dmu_tx_t *tx) + const void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; @@ -928,8 +1004,12 @@ retry: goto retry; } - if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + if (l != NULL) { + if (err == ENOSPC) + zap_put_leaf(l); + else + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + } return (err); } @@ -948,9 +1028,9 @@ fzap_length(zap_name_t *zn, if (err != 0) goto out; - if (integer_size != 0) + if (integer_size != NULL) *integer_size = zeh.zeh_integer_size; - if (num_integers != 0) + if (num_integers != NULL) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); @@ -971,6 +1051,10 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx) if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); + + if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && + zap_shrink_enabled) + return (zap_shrink(zn, l, tx)); } zap_put_leaf(l); return (err); @@ -987,7 +1071,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1230,18 +1314,24 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } - if (zc->zc_leaf && - (ZAP_HASH_IDX(zc->zc_hash, - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != - zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); - zap_put_leaf(zc->zc_leaf); - zc->zc_leaf = NULL; + + /* + * The leaf was either shrunk or split. 
+ */ + if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } } again: @@ -1250,8 +1340,6 @@ again: &zc->zc_leaf); if (err != 0) return (err); - } else { - rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } l = zc->zc_leaf; @@ -1358,7 +1446,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1368,7 +1456,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { @@ -1380,7 +1468,242 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } -/* BEGIN CSTYLED */ +/* + * Find the last allocated block and update freeblk. + */ +static void +zap_trunc(zap_t *zap) +{ + uint64_t nentries; + uint64_t lastblk; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { + /* External ptrtbl */ + nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); + lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + + zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; + } else { + /* Embedded ptrtbl */ + nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + lastblk = 0; + } + + for (uint64_t idx = 0; idx < nentries; idx++) { + uint64_t blk; + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + if (blk > lastblk) + lastblk = blk; + } + + ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); + + zap_f_phys(zap)->zap_freeblk = lastblk + 1; +} + +/* + * ZAP shrinking algorithm. + * + * We shrink ZAP recursively, removing empty leaves. We can remove an empty leaf + * only if it has a sibling. Sibling leaves have the same prefix length and + * their prefixes differ only by the least significant (sibling) bit. We require + * both siblings to be empty. This eliminates the need to rehash the non-empty + * remaining leaf. When we have removed one of the two empty siblings, we set + * the ptrtbl entries of the removed leaf to point to the remaining leaf. The + * prefix length of the remaining leaf is decremented. As a result, it has a new + * prefix and it might have a new sibling. So, we repeat the process. + * + * Steps: + * 1. Check if a sibling leaf (sl) exists and is empty. + * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. + * 3. Release the sibling (sl) so it can be dereferenced again with a WRITER lock. + * 4. Upgrade the zapdir lock to WRITER (once). + * 5. Dereference the released leaves again. + * 6. If needed, recheck whether both leaves are still siblings and empty. + * 7. Set the ptrtbl pointers of the removed leaf (slbit 1) to point to the blkid + * of the remaining leaf (slbit 0). + * 8. Free the disk block of the removed leaf (dmu_free_range). + * 9. Decrement prefix_len of the remaining leaf. + * 10. Repeat the steps.
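A toy walk-through of the prefix bookkeeping in steps 9 and 10, assuming each successive sibling also turns out to be empty; this models only the arithmetic on lh_prefix/lh_prefix_len, with no locking or I/O:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t lh_prefix = 0x6;	/* 0b0110: the empty leaf we keep */
	uint64_t lh_prefix_len = 4;

	while (lh_prefix_len > 0) {
		uint64_t sibling = lh_prefix ^ 1;
		printf("prefix 0x%llx/%llu collapses with sibling 0x%llx\n",
		    (unsigned long long)lh_prefix,
		    (unsigned long long)lh_prefix_len,
		    (unsigned long long)sibling);
		/* Step 9: the surviving leaf drops its lowest prefix bit. */
		lh_prefix >>= 1;
		lh_prefix_len--;
	}
	return (0);
}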
+ */ +static int +zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + uint64_t hash = zn->zn_hash; + uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + boolean_t trunc = B_FALSE; + int err = 0; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); + + boolean_t writer = B_FALSE; + + /* + * To avoid deadlock always deref leaves in the same order - + * sibling 0 first, then sibling 1. + */ + while (prefix_len) { + zap_leaf_t *sl; + int64_t prefix_diff = zt_shift - prefix_len; + uint64_t sl_prefix = prefix ^ 1; + uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); + int slbit = prefix & 1; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + + /* + * Check if there is a sibling by reading ptrtbl ptrs. + */ + if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) + break; + + /* + * sibling 1, unlock it - we haven't yet dereferenced sibling 0. + */ + if (slbit == 1) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Dereference sibling leaf and check if it is empty. + */ + if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, + &sl)) != 0) + break; + + ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); + + /* + * Check if we have a sibling and it is empty. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || + zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { + zap_put_leaf(sl); + break; + } + + zap_put_leaf(sl); + + /* + * If there two empty sibling, we have work to do, so + * we need to lock ZAP ptrtbl as WRITER. + */ + if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { + /* We failed to upgrade */ + if (l != NULL) { + zap_put_leaf(l); + l = NULL; + } + + /* + * Usually, the right way to upgrade from a READER lock + * to a WRITER lock is to call zap_unlockdir() and + * zap_lockdir(), but we do not have a tag. Instead, + * we do it in more sophisticated way. + */ + rw_exit(&zap->zap_rwlock); + rw_enter(&zap->zap_rwlock, RW_WRITER); + dmu_buf_will_dirty(zap->zap_dbuf, tx); + + zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + writer = B_TRUE; + } + + /* + * Here we have WRITER lock for ptrtbl. + * Now, we need a WRITER lock for both siblings leaves. + * Also, we have to recheck if the leaves are still siblings + * and still empty. + */ + if (l == NULL) { + /* sibling 0 */ + if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), + tx, RW_WRITER, &l)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) + break; + } + + /* sibling 1 */ + if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, + RW_WRITER, &sl)) != 0) + break; + + /* + * The leaf isn't empty anymore or + * it was shrunk/split while our locks were down. + */ + if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || + zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { + zap_put_leaf(sl); + break; + } + + /* If we have gotten here, we have a leaf to collapse */ + uint64_t idx = (slbit ? 
prefix : sl_prefix) << prefix_diff; + uint64_t nptrs = (1ULL << prefix_diff); + uint64_t sl_blkid = sl->l_blkid; + + /* + * Set ptrtbl entries to point out to the slibling 0 blkid + */ + if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, + tx)) != 0) { + zap_put_leaf(sl); + break; + } + + /* + * Free sibling 1 disk block. + */ + int bs = FZAP_BLOCK_SHIFT(zap); + if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) + trunc = B_TRUE; + + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + sl_blkid << bs, 1 << bs, tx); + zap_put_leaf(sl); + + zap_f_phys(zap)->zap_num_leafs--; + + /* + * Update prefix and prefix_len. + */ + zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len--; + + prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; + prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + } + + if (trunc) + zap_trunc(zap); + + if (l != NULL) + zap_put_leaf(l); + + return (err); +} + +/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, "When iterating ZAP object, prefetch it"); -/* END CSTYLED */ + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, + "Enable ZAP shrinking"); diff --git a/sys/contrib/openzfs/module/zfs/zap_leaf.c b/sys/contrib/openzfs/module/zfs/zap_leaf.c index aa6c298c3b4b..032aca92695e 100644 --- a/sys/contrib/openzfs/module/zfs/zap_leaf.c +++ b/sys/contrib/openzfs/module/zfs/zap_leaf.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -41,7 +41,8 @@ #include <sys/zap_leaf.h> #include <sys/arc.h> -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, + uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ @@ -52,18 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) -extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); - -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - static void stv(int len, void *addr, uint64_t value) { @@ -81,7 +70,7 @@ stv(int len, void *addr, uint64_t value) *(uint64_t *)addr = value; return; default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } } @@ -98,13 +87,13 @@ ldv(int len, const void *addr) case 8: return (*(uint64_t *)addr); default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size) { zap_leaf_t l; dmu_buf_t l_dbuf; @@ -121,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ 
-162,11 +151,11 @@ void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -187,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -209,7 +198,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) zlf->lf_type = ZAP_CHUNK_FREE; zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; - bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ + memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; zap_leaf_phys(l)->l_hdr.lh_nfree++; @@ -225,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; - int byten = 0; + int byten = integer_size; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); + if (len > 0) + value = ldv(integer_size, buf); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; + if (--byten == 0) { if (--len == 0) break; + byten = integer_size; + buf += integer_size; + value = ldv(integer_size, buf); } } @@ -266,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) *chunkp = CHAIN_END; while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); @@ -306,7 +296,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); + memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES); p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } @@ -317,7 +307,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { @@ -335,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) + uint_t chunk, int array_numints) { int bseen = 0; @@ -346,7 +336,7 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, 
thiskey); - boolean_t match = bcmp(thiskey, zn->zn_key_orig, + boolean_t match = memcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); @@ -374,7 +364,8 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) + if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen, + toread)) break; chunk = la->la_next; bseen += toread; @@ -563,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint64_t valuelen = integer_size * num_integers; - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); @@ -625,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -647,7 +638,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, * form of the name. But all callers have one of these on hand anyway, * so might as well take advantage. A cleaner but slower interface * would accept neither argument, and compute the normalized name as - * needed (using zap_name_alloc(zap_entry_read_name(zeh))). + * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))). 
*/ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, @@ -668,7 +659,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, continue; if (zn == NULL) { - zn = zap_name_alloc(zap, name, MT_NORMALIZE); + zn = zap_name_alloc_str(zap, name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, @@ -688,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, */ static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; @@ -723,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; + uint_t nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); @@ -740,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) } static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); @@ -749,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - (void) zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, nle, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -767,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; @@ -778,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) @@ -793,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. 
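The split described above comes down to one bit test: an entry migrates to the new leaf exactly when the bit that extends the prefix is set in its hash. A standalone sketch of that test, with bit computed from the pre-split prefix length the same way zap_leaf_split() does:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t lh_prefix_len = 4;		/* prefix length before the split */
	uint64_t bit = 64 - 1 - lh_prefix_len;	/* the new distinguishing bit */
	/* Both hashes share the current 4-bit prefix 0b1010. */
	uint64_t hashes[] = { 0xa800000000000000ULL, 0xa000000000000000ULL };

	for (int i = 0; i < 2; i++) {
		printf("hash 0x%016llx -> %s leaf\n",
		    (unsigned long long)hashes[i],
		    (hashes[i] & (1ULL << bit)) ? "new" : "old");
	}
	return (0);
}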
*/ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -801,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - (void) zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, le, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -824,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + uint_t nentries = 0; + uint_t chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index b4611685b204..d806988af96d 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -33,7 +33,7 @@ #include <sys/zap.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> -#include <sys/avl.h> +#include <sys/btree.h> #include <sys/arc.h> #include <sys/dmu_objset.h> @@ -41,10 +41,10 @@ #include <sys/sunddi.h> #endif -extern inline mzap_phys_t *zap_m_phys(zap_t *zap); +int zap_micro_max_size = MZAP_MAX_BLKSZ; static int mzap_upgrade(zap_t **zapp, - void *tag, dmu_tx_t *tx, zap_flags_t flags); + const void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) @@ -94,7 +94,7 @@ zap_hash(zap_name_t *zn) wp++, i++) { uint64_t word = *wp; - for (int j = 0; j < zn->zn_key_intlen; j++) { + for (int j = 0; j < 8; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; @@ -164,18 +164,25 @@ zap_match(zap_name_t *zn, const char *matchname) } } +static zap_name_t * +zap_name_alloc(zap_t *zap) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zn->zn_zap = zap; + return (zn); +} + void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } -zap_name_t * -zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) +static int +zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) { - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zap_t *zap = zn->zn_zap; - zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; @@ -196,17 +203,13 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) * what the hash is computed from. 
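For MT_NORMALIZE lookups the hash has to be taken over the normalized key, as the comment above explains; otherwise "Foo" and "foo" could hash into different leaves even though a case-insensitive match must find both. A toy stand-in makes the idea concrete (the real code normalizes with zap_normalize() and hashes with the pool's salted CRC-64):

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for normalization with case folding; illustration only. */
static void
normalize(const char *in, char *out, size_t outlen)
{
	size_t i;
	for (i = 0; in[i] != '\0' && i + 1 < outlen; i++)
		out[i] = (char)tolower((unsigned char)in[i]);
	out[i] = '\0';
}

/* Stand-in hash; not the ZAP CRC-64. */
static uint64_t
toy_hash(const char *s)
{
	uint64_t h = 0;
	while (*s != '\0')
		h = h * 1099511628211ULL + (uint8_t)*s++;
	return (h);
}

int
main(void)
{
	char a[64], b[64];
	normalize("Foo", a, sizeof (a));
	normalize("foo", b, sizeof (b));
	/* Same hash, so both spellings land in the same leaf: prints 1. */
	printf("%d\n", toy_hash(a) == toy_hash(b));
	return (0);
}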
*/ if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zap->zap_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { - if (mt != 0) { - zap_name_free(zn); - return (NULL); - } + if (mt != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } @@ -219,13 +222,22 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zn->zn_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } + return (0); +} + +zap_name_t * +zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) +{ + zap_name_t *zn = zap_name_alloc(zap); + if (zap_name_init_str(zn, key, mt) != 0) { + zap_name_free(zn); + return (NULL); + } return (zn); } @@ -273,51 +285,56 @@ zap_byteswap(void *buf, size_t size) } } +__attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); - if (likely(cmp)) - return (cmp); - - return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, + (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t, + mze_compare) + static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash) +mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { + mzap_ent_t mze; + ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); - mze->mze_chunkid = chunkid; - mze->mze_hash = hash; - mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; - ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); - avl_add(&zap->zap_m.zap_avl, mze); + mze.mze_chunkid = chunkid; + ASSERT0(hash & 0xffffffff); + mze.mze_hash = hash >> 32; + ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); + mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; + ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); + zfs_btree_add(&zap->zap_m.zap_tree, &mze); } static mzap_ent_t * -mze_find(zap_name_t *zn) +mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; + zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - mze_tofind.mze_hash = zn->zn_hash; + ASSERT0(zn->zn_hash & 0xffffffff); + mze_tofind.mze_hash = zn->zn_hash >> 32; mze_tofind.mze_cd = 0; - mze = avl_find(avl, &mze_tofind, &idx); + mze = zfs_btree_find(tree, &mze_tofind, idx); if (mze == NULL) - mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + mze = zfs_btree_next(tree, idx, idx); + for (; mze && mze->mze_hash == mze_tofind.mze_hash; + mze = zfs_btree_next(tree, idx, idx)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); @@ -330,18 +347,21 @@ static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + 
zfs_btree_t *tree = &zap->zap_m.zap_tree; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; - for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { if (mze->mze_cd != cd) break; cd++; @@ -366,16 +386,18 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + zfs_btree_t *tree = &zap->zap_m.zap_tree; uint32_t mzap_ents = 0; + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; - for (mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { mzap_ents++; } @@ -386,28 +408,14 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) } static void -mze_remove(zap_t *zap, mzap_ent_t *mze) -{ - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - avl_remove(&zap->zap_m.zap_avl, mze); - kmem_free(mze, sizeof (mzap_ent_t)); -} - -static void mze_destroy(zap_t *zap) { - mzap_ent_t *mze; - void *avlcookie = NULL; - - while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) - kmem_free(mze, sizeof (mzap_ent_t)); - avl_destroy(&zap->zap_m.zap_avl); + zfs_btree_clear(&zap->zap_m.zap_tree); + zfs_btree_destroy(&zap->zap_m.zap_tree); } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -419,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -450,21 +458,26 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; - avl_create(&zap->zap_m.zap_avl, mze_compare, - sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { + /* + * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() + * overhead on massive inserts below. It still allows to store + * 62 entries before we have to add 2KB B-tree core node. 
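The 62-entry figure above checks out with simple arithmetic if the in-core mzap_ent_t now occupies 8 bytes (a 32-bit truncated hash plus 16-bit cd and chunk id, as mze_insert() above suggests) and a b-tree leaf spends roughly 16 bytes on its header; both sizes are assumptions for this back-of-the-envelope sketch, not values taken from the headers:

#include <stdio.h>

int
main(void)
{
	int leaf_size = 512;	/* custom leaf size passed above */
	int hdr_size = 16;	/* assumed per-leaf b-tree header */
	int elem_size = 8;	/* assumed sizeof (mzap_ent_t) */

	/* (512 - 16) / 8 = 62 entries per leaf. */
	printf("entries per leaf: %d\n", (leaf_size - hdr_size) / elem_size);
	return (0);
}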
+ */ + zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, + mze_find_in_buf, sizeof (mzap_ent_t), 512); + + zap_name_t *zn = zap_name_alloc(zap); + for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { - zap_name_t *zn; - zap->zap_m.zap_num_entries++; - zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); mze_insert(zap, i, zn->zn_hash); - zap_name_free(zn); } } + zap_name_free(zn); } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; @@ -505,7 +518,7 @@ handle_winner: * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -515,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. @@ -550,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -561,7 +575,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; - if (newsz > MZAP_MAX_BLKSZ) { + if (newsz > zap_micro_max_size) { dprintf("upgrading obj %llu: num_entries=%u\n", (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; @@ -581,60 +595,58 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, - krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) + krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, + zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = 
zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } void -zap_unlockdir(zap_t *zap, void *tag) +zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } static int -mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) +mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) { int err = 0; zap_t *zap = *zapp; @@ -643,7 +655,7 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) int sz = zap->zap_dbuf->db_size; mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); - bcopy(zap->zap_dbuf->db_data, mzp, sz); + memcpy(mzp, zap->zap_dbuf->db_data, sz); int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { @@ -657,24 +669,25 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) dprintf("upgrading obj=%llu with %u chunks\n", (u_longlong_t)zap->zap_object, nchunks); - /* XXX destroy the avl later, so we can use the stored hash value */ + /* XXX destroy the tree later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); + zap_name_t *zn = zap_name_alloc(zap); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, (u_longlong_t)mze->mze_value); - zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx)); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ - zap_name_free(zn); } + zap_name_free(zn); vmem_free(mzp, sz); *zapp = zap; return (0); @@ -714,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -727,7 +741,7 @@ static uint64_t zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { uint64_t obj; @@ -859,7 +873,7 @@ uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) + dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, @@ -916,22 +930,23 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) * See also the comment above zap_entry_normalization_conflict(). 
*/ static boolean_t -mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, + zfs_btree_index_t *idx) { - int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; + mzap_ent_t *other; + zfs_btree_index_t oidx; if (zap->zap_normflags == 0) return (B_FALSE); -again: - for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; - other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { - zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, - MT_NORMALIZE); + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { @@ -941,9 +956,20 @@ again: } } - if (direction == AVL_BEFORE) { - direction = AVL_AFTER; - goto again; + for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); + other && other->mze_hash == mze->mze_hash; + other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { + + if (zn == NULL) { + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } } if (allocdzn) @@ -971,7 +997,7 @@ zap_lookup_impl(zap_t *zap, const char *name, { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); @@ -979,7 +1005,8 @@ zap_lookup_impl(zap_t *zap, const char *name, err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -990,11 +1017,13 @@ zap_lookup_impl(zap_t *zap, const char *name, } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; - (void) strlcpy(realname, - MZE_PHYS(zap, mze)->mze_name, rn_len); + if (realname != NULL) + (void) strlcpy(realname, + MZE_PHYS(zap, mze)->mze_name, + rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, - zn, mze); + zn, mze, &idx); } } } @@ -1031,7 +1060,7 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, 0); + zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1134,7 +1163,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1142,7 +1171,8 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1182,7 +1212,7 @@ static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; - int start = zap->zap_m.zap_alloc_next; + uint16_t start = zap->zap_m.zap_alloc_next; 
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -1198,7 +1228,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) ASSERT(cd < zap_maxcd(zap)); again: - for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { + for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; @@ -1224,12 +1254,12 @@ again: static int zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx, void *tag) + const void *val, dmu_tx_t *tx, const void *tag) { const uint64_t *intval = val; int err = 0; - zap_name_t *zn = zap_name_alloc(zap, key, 0); + zap_name_t *zn = zap_name_alloc_str(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); @@ -1247,7 +1277,8 @@ zap_add_impl(zap_t *zap, const char *key, } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - if (mze_find(zn) != NULL) { + zfs_btree_index_t idx; + if (mze_find(zn, &idx) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); @@ -1292,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1303,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1327,7 +1388,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1348,7 +1409,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } zap = zn->zn_zap; /* fzap_update() may change zap */ } else 
{ - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze != NULL) { MZE_PHYS(zap, mze)->mze_value = *intval; } else { @@ -1362,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) { - zap_t *zap; + int err; - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1398,20 +1489,20 @@ zap_remove_impl(zap_t *zap, const char *name, { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; - bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], - sizeof (mzap_ent_phys_t)); - mze_remove(zap, mze); + memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); + zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); } } zap_name_free(zn); @@ -1447,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1457,14 +1565,23 @@ 
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1582,29 +1699,30 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { - avl_index_t idx; + zfs_btree_index_t idx; mzap_ent_t mze_tofind; - mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_hash = zc->zc_hash >> 32; mze_tofind.mze_cd = zc->zc_cd; - mzap_ent_t *mze = - avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, + &mze_tofind, &idx); if (mze == NULL) { - mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, - idx, AVL_AFTER); + mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, + &idx, &idx); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, mze); + mzap_normalization_conflict(zc->zc_zap, NULL, + mze, &idx); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strlcpy(za->za_name, mzep->mze_name, sizeof (za->za_name)); - zc->zc_hash = mze->mze_hash; + zc->zc_hash = (uint64_t)mze->mze_hash << 32; zc->zc_cd = mze->mze_cd; err = 0; } else { @@ -1634,7 +1752,7 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) if (err != 0) return (err); - bzero(zs, sizeof (zap_stats_t)); + memset(zs, 0, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; @@ -1669,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); @@ -1695,4 +1816,8 @@ EXPORT_SYMBOL(zap_cursor_advance); EXPORT_SYMBOL(zap_cursor_serialize); EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, + "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); #endif diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c index f724b44baf1d..7c279162a9d1 100644 --- a/sys/contrib/openzfs/module/zfs/zcp.c +++ b/sys/contrib/openzfs/module/zfs/zcp.c @@ -108,9 +108,9 @@ #define ZCP_NVLIST_MAX_DEPTH 20 
-uint64_t zfs_lua_check_instrlimit_interval = 100; -unsigned long zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; -unsigned long zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; +static const uint64_t zfs_lua_check_instrlimit_interval = 100; +uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; +uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; /* * Forward declarations for mutually recursive functions @@ -277,9 +277,9 @@ zcp_table_to_nvlist(lua_State *state, int index, int depth) } break; case LUA_TNUMBER: - VERIFY3U(sizeof (buf), >, - snprintf(buf, sizeof (buf), "%lld", - (longlong_t)lua_tonumber(state, -2))); + (void) snprintf(buf, sizeof (buf), "%lld", + (longlong_t)lua_tonumber(state, -2)); + key = buf; if (saw_str_could_collide) { key_could_collide = B_TRUE; @@ -544,7 +544,7 @@ zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair, fnvpair_value_nvlist(pair), errbuf, errbuf_len); break; case DATA_TYPE_STRING_ARRAY: { - char **strarr; + const char **strarr; uint_t nelem; (void) nvpair_value_string_array(pair, &strarr, &nelem); lua_newtable(state); @@ -622,7 +622,7 @@ zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname, */ dsl_dataset_t * zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, - void *tag) + const void *tag) { dsl_dataset_t *ds; int error = dsl_dataset_hold(dp, dsname, tag, &ds); @@ -631,11 +631,11 @@ zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, } static int zcp_debug(lua_State *); -static zcp_lib_info_t zcp_debug_info = { +static const zcp_lib_info_t zcp_debug_info = { .name = "debug", .func = zcp_debug, .pargs = { - { .za_name = "debug string", .za_lua_type = LUA_TSTRING}, + { .za_name = "debug string", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -648,7 +648,7 @@ zcp_debug(lua_State *state) { const char *dbgstring; zcp_run_info_t *ri = zcp_run_info(state); - zcp_lib_info_t *libinfo = &zcp_debug_info; + const zcp_lib_info_t *libinfo = &zcp_debug_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); @@ -661,11 +661,11 @@ zcp_debug(lua_State *state) } static int zcp_exists(lua_State *); -static zcp_lib_info_t zcp_exists_info = { +static const zcp_lib_info_t zcp_exists_info = { .name = "exists", .func = zcp_exists, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -678,7 +678,7 @@ zcp_exists(lua_State *state) { zcp_run_info_t *ri = zcp_run_info(state); dsl_pool_t *dp = ri->zri_pool; - zcp_lib_info_t *libinfo = &zcp_exists_info; + const zcp_lib_info_t *libinfo = &zcp_exists_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); @@ -769,10 +769,10 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) } } -/* ARGSUSED */ static void zcp_lua_counthook(lua_State *state, lua_Debug *ar) { + (void) ar; lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); zcp_run_info_t *ri = lua_touserdata(state, -1); @@ -780,8 +780,7 @@ zcp_lua_counthook(lua_State *state, lua_Debug *ar) * Check if we were canceled while waiting for the * txg to sync or from our open context thread */ - if (ri->zri_canceled || - (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { + if (ri->zri_canceled || (!ri->zri_sync && issig())) { ri->zri_canceled = B_TRUE; (void) lua_pushstring(state, "Channel program was canceled."); (void) lua_error(state); @@ -958,12 +957,12 @@ zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri) } static void -zcp_pool_error(zcp_run_info_t *ri, const char 
*poolname) +zcp_pool_error(zcp_run_info_t *ri, const char *poolname, int error) { ri->zri_result = SET_ERROR(ECHRNG); lua_settop(ri->zri_state, 0); - (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s", - poolname); + (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s " + "errno: %d", poolname, error); zcp_convert_return_values(ri->zri_state, ri->zri_outnvl, ZCP_RET_ERROR, &ri->zri_result); @@ -974,10 +973,10 @@ zcp_pool_error(zcp_run_info_t *ri, const char *poolname) * The txg_wait_synced_sig will continue to wait for the txg to complete * after calling this callback. */ -/* ARGSUSED */ static void zcp_eval_sig(void *arg, dmu_tx_t *tx) { + (void) tx; zcp_run_info_t *ri = arg; ri->zri_canceled = B_TRUE; @@ -1013,7 +1012,7 @@ zcp_eval_open(zcp_run_info_t *ri, const char *poolname) error = dsl_pool_hold(poolname, FTAG, &dp); if (error != 0) { - zcp_pool_error(ri, poolname); + zcp_pool_error(ri, poolname, error); return; } @@ -1159,7 +1158,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync, zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL); if (err != 0) - zcp_pool_error(&runinfo, poolname); + zcp_pool_error(&runinfo, poolname, err); } else { zcp_eval_open(&runinfo, poolname); } @@ -1443,10 +1442,8 @@ zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, } } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, U64, ZMOD_RW, "Max instruction limit that can be specified for a channel program"); -ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, U64, ZMOD_RW, "Max memory limit that can be specified for a channel program"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zcp_get.c b/sys/contrib/openzfs/module/zfs/zcp_get.c index 7256e4de1915..6fd45151d92a 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_get.c +++ b/sys/contrib/openzfs/module/zfs/zcp_get.c @@ -76,9 +76,8 @@ get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) static int get_objset_type_name(dsl_dataset_t *ds, char *str) { - int error; - zfs_type_t type; - error = get_objset_type(ds, &type); + zfs_type_t type = ZFS_TYPE_INVALID; + int error = get_objset_type(ds, &type); if (error != 0) return (error); switch (type) { @@ -230,7 +229,7 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); char setpoint[ZFS_MAX_DATASET_NAME_LEN] = "Internal error - setpoint not determined"; - zfs_type_t ds_type; + zfs_type_t ds_type = ZFS_TYPE_INVALID; zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); (void) get_objset_type(ds, &ds_type); @@ -344,19 +343,13 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, } break; case ZFS_PROP_RECEIVE_RESUME_TOKEN: { - char *token = get_receive_resume_stats_impl(ds); - - (void) strlcpy(strval, token, ZAP_MAXVALUELEN); - if (strcmp(strval, "") == 0) { - char *childval = get_child_receive_stats(ds); - - (void) strlcpy(strval, childval, ZAP_MAXVALUELEN); - if (strcmp(strval, "") == 0) - error = ENOENT; - - kmem_strfree(childval); + char *token = get_receive_resume_token(ds); + if (token != NULL) { + (void) strlcpy(strval, token, ZAP_MAXVALUELEN); + kmem_strfree(token); + } else { + error = ENOENT; } - kmem_strfree(token); break; } case ZFS_PROP_VOLSIZE: @@ -398,7 +391,7 @@ get_special_prop(lua_State *state, dsl_dataset_t 
*ds, const char *dsname, dsl_dataset_crypt_stats(ds, nvl); if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop), &propval) == 0) { - char *source; + const char *source; (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, &numval); @@ -410,6 +403,10 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, break; } + case ZFS_PROP_SNAPSHOTS_CHANGED: + numval = dsl_dir_snap_cmtime(ds->ds_dir).tv_sec; + break; + default: /* Did not match these props, check in the dsl_dir */ error = get_dsl_dir_prop(ds, zfs_prop, &numval); @@ -470,11 +467,13 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) } else { error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), 1, &numval, setpoint); - + if (error != 0) + goto out; #ifdef _KERNEL /* Fill in temporary value for prop, if applicable */ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint); #else + kmem_free(strval, ZAP_MAXVALUELEN); return (luaL_error(state, "temporary properties only supported in kernel mode", prop_name)); @@ -491,6 +490,7 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) (void) lua_pushnumber(state, numval); } } +out: kmem_free(strval, ZAP_MAXVALUELEN); if (error == 0) get_prop_src(state, setpoint, zfs_prop); @@ -503,8 +503,7 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) { - int error; - zfs_type_t zfs_type; + zfs_type_t zfs_type = ZFS_TYPE_INVALID; /* properties not supported */ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) || @@ -515,7 +514,7 @@ prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir))) return (B_FALSE); - error = get_objset_type(ds, &zfs_type); + int error = get_objset_type(ds, &zfs_type); if (error != 0) return (B_FALSE); return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE)); @@ -611,8 +610,7 @@ parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, */ int domain_len = strrchr(cp, '-') - cp; domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); - (void) strncpy(domain_val, cp, domain_len); - domain_val[domain_len] = '\0'; + (void) strlcpy(domain_val, cp, domain_len + 1); cp += domain_len + 1; (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); @@ -743,12 +741,12 @@ zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, } static int zcp_get_prop(lua_State *state); -static zcp_lib_info_t zcp_get_prop_info = { +static const zcp_lib_info_t zcp_get_prop_info = { .name = "get_prop", .func = zcp_get_prop, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - { .za_name = "property", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, + { .za_name = "property", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -762,7 +760,7 @@ zcp_get_prop(lua_State *state) const char *dataset_name; const char *property_name; dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_lib_info_t *libinfo = &zcp_get_prop_info; + const zcp_lib_info_t *libinfo = &zcp_get_prop_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); diff --git a/sys/contrib/openzfs/module/zfs/zcp_iter.c b/sys/contrib/openzfs/module/zfs/zcp_iter.c index f727c56f212d..2da0bf9740e5 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_iter.c +++ b/sys/contrib/openzfs/module/zfs/zcp_iter.c @@ -107,12 +107,12 @@ zcp_clones_iter(lua_State *state) } static int zcp_clones_list(lua_State *); -static zcp_list_info_t zcp_clones_list_info = { +static 
const zcp_list_info_t zcp_clones_list_info = { .name = "clones", .func = zcp_clones_list, .gc = NULL, .pargs = { - { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, + { .za_name = "snapshot", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -194,12 +194,12 @@ zcp_snapshots_iter(lua_State *state) } static int zcp_snapshots_list(lua_State *); -static zcp_list_info_t zcp_snapshots_list_info = { +static const zcp_list_info_t zcp_snapshots_list_info = { .name = "snapshots", .func = zcp_snapshots_list, .gc = NULL, .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -281,12 +281,12 @@ zcp_children_iter(lua_State *state) } static int zcp_children_list(lua_State *); -static zcp_list_info_t zcp_children_list_info = { +static const zcp_list_info_t zcp_children_list_info = { .name = "children", .func = zcp_children_list, .gc = NULL, .pargs = { - { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -333,7 +333,7 @@ zcp_user_props_list_gc(lua_State *state) static int zcp_user_props_iter(lua_State *state) { - char *source, *val; + const char *source, *val; nvlist_t *nvprop; nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1)); nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2)); @@ -361,13 +361,13 @@ zcp_user_props_iter(lua_State *state) } static int zcp_user_props_list(lua_State *); -static zcp_list_info_t zcp_user_props_list_info = { +static const zcp_list_info_t zcp_user_props_list_info = { .name = "user_properties", .func = zcp_user_props_list, .gc = zcp_user_props_list_gc, .pargs = { { .za_name = "filesystem | snapshot | volume", - .za_lua_type = LUA_TSTRING}, + .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -383,13 +383,13 @@ static zcp_list_info_t zcp_user_props_list_info = { * versions of ZFS, we declare 'properties' as an alias for * 'user_properties'. 
*/ -static zcp_list_info_t zcp_props_list_info = { +static const zcp_list_info_t zcp_props_list_info = { .name = "properties", .func = zcp_user_props_list, .gc = zcp_user_props_list_gc, .pargs = { { .za_name = "filesystem | snapshot | volume", - .za_lua_type = LUA_TSTRING}, + .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -444,11 +444,11 @@ zcp_dataset_system_props(dsl_dataset_t *ds, nvlist_t *nv) } static int zcp_system_props_list(lua_State *); -static zcp_list_info_t zcp_system_props_list_info = { +static const zcp_list_info_t zcp_system_props_list_info = { .name = "system_properties", .func = zcp_system_props_list, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -467,7 +467,7 @@ zcp_system_props_list(lua_State *state) char errbuf[128]; const char *dataset_name; dsl_pool_t *dp = zcp_run_info(state)->zri_pool; - zcp_list_info_t *libinfo = &zcp_system_props_list_info; + const zcp_list_info_t *libinfo = &zcp_system_props_list_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); dataset_name = lua_tostring(state, 1); nvlist_t *nv = fnvlist_alloc(); @@ -566,11 +566,11 @@ zcp_bookmarks_iter(lua_State *state) } static int zcp_bookmarks_list(lua_State *); -static zcp_list_info_t zcp_bookmarks_list_info = { +static const zcp_list_info_t zcp_bookmarks_list_info = { .name = "bookmarks", .func = zcp_bookmarks_list, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -654,12 +654,12 @@ zcp_holds_iter(lua_State *state) } static int zcp_holds_list(lua_State *); -static zcp_list_info_t zcp_holds_list_info = { +static const zcp_list_info_t zcp_holds_list_info = { .name = "holds", .func = zcp_holds_list, .gc = NULL, .pargs = { - { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, + { .za_name = "snapshot", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -710,8 +710,7 @@ zcp_list_func(lua_State *state) int zcp_load_list_lib(lua_State *state) { - int i; - zcp_list_info_t *zcp_list_funcs[] = { + const zcp_list_info_t *zcp_list_funcs[] = { &zcp_children_list_info, &zcp_snapshots_list_info, &zcp_user_props_list_info, @@ -725,8 +724,8 @@ zcp_load_list_lib(lua_State *state) lua_newtable(state); - for (i = 0; zcp_list_funcs[i] != NULL; i++) { - zcp_list_info_t *info = zcp_list_funcs[i]; + for (int i = 0; zcp_list_funcs[i] != NULL; i++) { + const zcp_list_info_t *info = zcp_list_funcs[i]; if (info->gc != NULL) { /* @@ -741,10 +740,9 @@ zcp_load_list_lib(lua_State *state) lua_pop(state, 1); } - lua_pushlightuserdata(state, info); + lua_pushlightuserdata(state, (void *)(uintptr_t)info); lua_pushcclosure(state, &zcp_list_func, 1); lua_setfield(state, -2, info->name); - info++; } return (1); diff --git a/sys/contrib/openzfs/module/zfs/zcp_synctask.c b/sys/contrib/openzfs/module/zfs/zcp_synctask.c index c6ade59b9ced..058910054d97 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_synctask.c +++ b/sys/contrib/openzfs/module/zfs/zcp_synctask.c @@ -114,25 +114,25 @@ zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc, static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_destroy_info = { +static const zcp_synctask_info_t zcp_synctask_destroy_info = { .name = "destroy", .func = zcp_synctask_destroy, .pargs = { - {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING}, + {.za_name = "filesystem | snapshot", 
.za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { - {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, + {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN }, {NULL, 0} }, .space_check = ZFS_SPACE_CHECK_DESTROY, .blocks_modified = 0 }; -/* ARGSUSED */ static int zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; const char *dsname = lua_tostring(state, 1); @@ -167,11 +167,11 @@ zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_promote_info = { +static const zcp_synctask_info_t zcp_synctask_promote_info = { .name = "promote", .func = zcp_synctask_promote, .pargs = { - {.za_name = "clone", .za_lua_type = LUA_TSTRING}, + {.za_name = "clone", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -205,13 +205,13 @@ zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_rollback_info = { +static const zcp_synctask_info_t zcp_synctask_rollback_info = { .name = "rollback", .func = zcp_synctask_rollback, .space_check = ZFS_SPACE_CHECK_RESERVED, .blocks_modified = 1, .pargs = { - {.za_name = "filesystem", .za_lua_type = LUA_TSTRING}, + {.za_name = "filesystem", .za_lua_type = LUA_TSTRING }, {0, 0} }, .kwargs = { @@ -236,12 +236,12 @@ zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_snapshot_info = { +static const zcp_synctask_info_t zcp_synctask_snapshot_info = { .name = "snapshot", .func = zcp_synctask_snapshot, .pargs = { {.za_name = "filesystem@snapname | volume@snapname", - .za_lua_type = LUA_TSTRING}, + .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -251,10 +251,10 @@ static zcp_synctask_info_t zcp_synctask_snapshot_info = { .blocks_modified = 3 }; -/* ARGSUSED */ static int zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; dsl_dataset_snapshot_arg_t ddsa = { 0 }; const char *dsname = lua_tostring(state, 1); @@ -302,9 +302,45 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) return (err); } +static int zcp_synctask_rename_snapshot(lua_State *, boolean_t, nvlist_t *); +static const zcp_synctask_info_t zcp_synctask_rename_snapshot_info = { + .name = "rename_snapshot", + .func = zcp_synctask_rename_snapshot, + .pargs = { + {.za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING }, + {.za_name = "oldsnapname", .za_lua_type = LUA_TSTRING }, + {.za_name = "newsnapname", .za_lua_type = LUA_TSTRING }, + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 1 +}; + +static int +zcp_synctask_rename_snapshot(lua_State *state, boolean_t sync, + nvlist_t *err_details) +{ + (void) err_details; + int err; + const char *fsname = lua_tostring(state, 1); + const char *oldsnapname = lua_tostring(state, 2); + const char *newsnapname = lua_tostring(state, 3); + + struct dsl_dataset_rename_snapshot_arg ddrsa = { 0 }; + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = B_FALSE; + + err = zcp_sync_task(state, dsl_dataset_rename_snapshot_check, + dsl_dataset_rename_snapshot_sync, &ddrsa, sync, NULL); + + return 
(err); +} + static int zcp_synctask_inherit_prop(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_inherit_prop_info = { +static const zcp_synctask_info_t zcp_synctask_inherit_prop_info = { .name = "inherit", .func = zcp_synctask_inherit_prop, .space_check = ZFS_SPACE_CHECK_RESERVED, @@ -325,7 +361,7 @@ zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx) zcp_inherit_prop_arg_t *args = arg; zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop); - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (zfs_prop_user(args->zipa_prop)) return (0); @@ -354,6 +390,7 @@ static int zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; zcp_inherit_prop_arg_t zipa = { 0 }; dsl_props_set_arg_t *dpsa = &zipa.zipa_dpsa; @@ -381,12 +418,12 @@ zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, } static int zcp_synctask_bookmark(lua_State *, boolean_t, nvlist_t *); -static zcp_synctask_info_t zcp_synctask_bookmark_info = { +static const zcp_synctask_info_t zcp_synctask_bookmark_info = { .name = "bookmark", .func = zcp_synctask_bookmark, .pargs = { - {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING}, - {.za_name = "bookmark", .za_lua_type = LUA_TSTRING}, + {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING }, + {.za_name = "bookmark", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { @@ -396,10 +433,10 @@ static zcp_synctask_info_t zcp_synctask_bookmark_info = { .blocks_modified = 1, }; -/* ARGSUSED */ static int zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; const char *source = lua_tostring(state, 1); const char *new = lua_tostring(state, 2); @@ -424,15 +461,15 @@ zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) } static int zcp_synctask_set_prop(lua_State *, boolean_t, nvlist_t *err_details); -static zcp_synctask_info_t zcp_synctask_set_prop_info = { +static const zcp_synctask_info_t zcp_synctask_set_prop_info = { .name = "set_prop", .func = zcp_synctask_set_prop, .space_check = ZFS_SPACE_CHECK_RESERVED, .blocks_modified = 2, .pargs = { - { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, - { .za_name = "property", .za_lua_type = LUA_TSTRING}, - { .za_name = "value", .za_lua_type = LUA_TSTRING}, + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, + { .za_name = "property", .za_lua_type = LUA_TSTRING }, + { .za_name = "value", .za_lua_type = LUA_TSTRING }, { NULL, 0 } }, .kwargs = { @@ -443,6 +480,7 @@ static zcp_synctask_info_t zcp_synctask_set_prop_info = { static int zcp_synctask_set_prop(lua_State *state, boolean_t sync, nvlist_t *err_details) { + (void) err_details; int err; zcp_set_prop_arg_t args = { 0 }; @@ -522,12 +560,12 @@ zcp_synctask_wrapper(lua_State *state) int zcp_load_synctask_lib(lua_State *state, boolean_t sync) { - int i; - zcp_synctask_info_t *zcp_synctask_funcs[] = { + const zcp_synctask_info_t *zcp_synctask_funcs[] = { &zcp_synctask_destroy_info, &zcp_synctask_promote_info, &zcp_synctask_rollback_info, &zcp_synctask_snapshot_info, + &zcp_synctask_rename_snapshot_info, &zcp_synctask_inherit_prop_info, &zcp_synctask_bookmark_info, &zcp_synctask_set_prop_info, @@ -536,13 +574,12 @@ zcp_load_synctask_lib(lua_State *state, boolean_t sync) lua_newtable(state); - for (i = 0; zcp_synctask_funcs[i] != NULL; i++) { - zcp_synctask_info_t *info = zcp_synctask_funcs[i]; - lua_pushlightuserdata(state, info); + for (int i = 0; zcp_synctask_funcs[i] != 
NULL; i++) { + const zcp_synctask_info_t *info = zcp_synctask_funcs[i]; + lua_pushlightuserdata(state, (void *)(uintptr_t)info); lua_pushboolean(state, sync); lua_pushcclosure(state, &zcp_synctask_wrapper, 2); lua_setfield(state, -2, info->name); - info++; } return (1); diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 9d16fff81d0a..1d25bc406866 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -389,6 +389,13 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) && feature->fi_feature == SPA_FEATURE_BOOKMARK_V2) spa->spa_errata = 0; + + /* + * Convert the old on-disk error log to the new format when activating + * the head_errlog feature. + */ + if (feature->fi_feature == SPA_FEATURE_HEAD_ERRLOG) + spa_upgrade_errlog(spa, tx); } static void diff --git a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c index cd35849c3f37..8666883f09a2 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c +++ b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -36,9 +36,7 @@ static void zfs_oldace_byteswap(ace_t *ace, int ace_cnt) { - int i; - - for (i = 0; i != ace_cnt; i++, ace++) { + for (int i = 0; i != ace_cnt; i++, ace++) { ace->a_who = BSWAP_32(ace->a_who); ace->a_access_mask = BSWAP_32(ace->a_access_mask); ace->a_flags = BSWAP_16(ace->a_flags); @@ -138,23 +136,16 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) } } -/* ARGSUSED */ void zfs_oldacl_byteswap(void *buf, size_t size) { - int cnt; - /* * Arggh, since we don't know how many ACEs are in * the array, we have to swap the entire block */ - - cnt = size / sizeof (ace_t); - - zfs_oldace_byteswap((ace_t *)buf, cnt); + zfs_oldace_byteswap((ace_t *)buf, size / sizeof (ace_t)); } -/* ARGSUSED */ void zfs_acl_byteswap(void *buf, size_t size) { diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c new file mode 100644 index 000000000000..acedeab7a163 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c @@ -0,0 +1,379 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> + */ + +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/zfs_chksum.h> +#include <sys/zfs_impl.h> + +#include <sys/blake3.h> +#include <sys/sha2.h> + +/* limit benchmarking to max 256KiB, when EdonR is slower then this: */ +#define LIMIT_PERF_MBS 300 + +typedef struct { + const char *name; + const char *impl; + uint64_t bs1k; + uint64_t bs4k; + uint64_t bs16k; + uint64_t bs64k; + uint64_t bs256k; + uint64_t bs1m; + uint64_t bs4m; + uint64_t bs16m; + zio_cksum_salt_t salt; + zio_checksum_t *(func); + zio_checksum_tmpl_init_t *(init); + zio_checksum_tmpl_free_t *(free); +} chksum_stat_t; + +static chksum_stat_t *chksum_stat_data = 0; +static int chksum_stat_cnt = 0; +static kstat_t *chksum_kstat = NULL; + +/* + * Sample output on i3-1005G1 System: + * + * implementation 1k 4k 16k 64k 256k 1m 4m 16m + * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767 + * skein-generic 548 594 613 623 621 623 621 486 + * sha256-generic 255 270 281 278 279 281 283 283 + * sha256-x64 288 310 316 317 318 317 317 316 + * sha256-ssse3 304 342 351 355 356 357 356 356 + * sha256-avx 311 348 359 362 362 363 363 362 + * sha256-avx2 330 378 389 395 395 395 395 395 + * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230 + * sha512-generic 359 409 431 427 429 430 428 423 + * sha512-x64 420 473 490 496 497 497 496 495 + * sha512-avx 406 522 546 560 560 560 556 560 + * sha512-avx2 464 568 601 606 609 610 607 608 + * blake3-generic 330 327 324 323 324 320 323 322 + * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408 + * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630 + * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101 + * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005 + */ +static int +chksum_kstat_headers(char *buf, size_t size) +{ + ssize_t off = 0; + + off += kmem_scnprintf(buf + off, size, "%-23s", "implementation"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "1k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "4k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "16k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "64k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "256k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "1m"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "4m"); + (void) kmem_scnprintf(buf + off, size - off, "%8s\n", "16m"); + + return (0); +} + +static int +chksum_kstat_data(char *buf, size_t size, void *data) +{ + chksum_stat_t *cs; + ssize_t off = 0; + char b[24]; + + cs = (chksum_stat_t *)data; + kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl); + off += kmem_scnprintf(buf + off, size - off, "%-23s", b); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs4k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs16k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs64k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs256k); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1m); + off += kmem_scnprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs4m); + (void) 
kmem_scnprintf(buf + off, size - off, "%8llu\n", + (u_longlong_t)cs->bs16m); + + return (0); +} + +static void * +chksum_kstat_addr(kstat_t *ksp, loff_t n) +{ + if (n < chksum_stat_cnt) + ksp->ks_private = (void *)(chksum_stat_data + n); + else + ksp->ks_private = NULL; + + return (ksp->ks_private); +} + +static void +chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, + uint64_t *result) +{ + hrtime_t start; + uint64_t run_bw, run_time_ns, run_count = 0, size = 0; + uint32_t l, loops = 0; + zio_cksum_t zcp; + + switch (round) { + case 1: /* 1k */ + size = 1<<10; loops = 128; break; + case 2: /* 2k */ + size = 1<<12; loops = 64; break; + case 3: /* 4k */ + size = 1<<14; loops = 32; break; + case 4: /* 16k */ + size = 1<<16; loops = 16; break; + case 5: /* 256k */ + size = 1<<18; loops = 8; break; + case 6: /* 1m */ + size = 1<<20; loops = 4; break; + case 7: /* 4m */ + size = 1<<22; loops = 1; break; + case 8: /* 16m */ + size = 1<<24; loops = 1; break; + } + + kpreempt_disable(); + start = gethrtime(); + do { + for (l = 0; l < loops; l++, run_count++) + cs->func(abd, size, ctx, &zcp); + + run_time_ns = gethrtime() - start; + } while (run_time_ns < MSEC2NSEC(1)); + kpreempt_enable(); + + run_bw = size * run_count * NANOSEC; + run_bw /= run_time_ns; /* B/s */ + *result = run_bw/1024/1024; /* MiB/s */ +} + +#define LIMIT_INIT 0 +#define LIMIT_NEEDED 1 +#define LIMIT_NOLIMIT 2 + +static void +chksum_benchit(chksum_stat_t *cs) +{ + abd_t *abd; + void *ctx = 0; + void *salt = &cs->salt.zcs_bytes; + static int chksum_stat_limit = LIMIT_INIT; + + memset(salt, 0, sizeof (cs->salt.zcs_bytes)); + if (cs->init) + ctx = cs->init(&cs->salt); + + /* allocate test memory via abd linear interface */ + abd = abd_alloc_linear(1<<20, B_FALSE); + chksum_run(cs, abd, ctx, 1, &cs->bs1k); + chksum_run(cs, abd, ctx, 2, &cs->bs4k); + chksum_run(cs, abd, ctx, 3, &cs->bs16k); + chksum_run(cs, abd, ctx, 4, &cs->bs64k); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + + /* check if we ran on a slow cpu */ + if (chksum_stat_limit == LIMIT_INIT) { + if (cs->bs1k < LIMIT_PERF_MBS) { + chksum_stat_limit = LIMIT_NEEDED; + } else { + chksum_stat_limit = LIMIT_NOLIMIT; + } + } + + /* skip benchmarks >= 1MiB when the CPU is to slow */ + if (chksum_stat_limit == LIMIT_NEEDED) + goto abort; + + chksum_run(cs, abd, ctx, 6, &cs->bs1m); + abd_free(abd); + + /* allocate test memory via abd non linear interface */ + abd = abd_alloc(1<<24, B_FALSE); + chksum_run(cs, abd, ctx, 7, &cs->bs4m); + chksum_run(cs, abd, ctx, 8, &cs->bs16m); + +abort: + abd_free(abd); + + /* free up temp memory */ + if (cs->free) + cs->free(ctx); +} + +/* + * Initialize and benchmark all supported implementations. 
+ */ +static void +chksum_benchmark(void) +{ +#ifndef _KERNEL + /* we need the benchmark only for the kernel module */ + return; +#endif + + chksum_stat_t *cs; + uint64_t max; + uint32_t id, cbid = 0, id_save; + const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3"); + const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); + const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + + /* count implementations */ + chksum_stat_cnt = 2; + chksum_stat_cnt += sha256->getcnt(); + chksum_stat_cnt += sha512->getcnt(); + chksum_stat_cnt += blake3->getcnt(); + chksum_stat_data = kmem_zalloc( + sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); + + /* edonr - needs to be the first one here (slow CPU check) */ + cs = &chksum_stat_data[cbid++]; + + /* edonr */ + cs->init = abd_checksum_edonr_tmpl_init; + cs->func = abd_checksum_edonr_native; + cs->free = abd_checksum_edonr_tmpl_free; + cs->name = "edonr"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* skein */ + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_skein_tmpl_init; + cs->func = abd_checksum_skein_native; + cs->free = abd_checksum_skein_tmpl_free; + cs->name = "skein"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* sha256 */ + id_save = sha256->getid(); + for (max = 0, id = 0; id < sha256->getcnt(); id++) { + sha256->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_sha256; + cs->free = 0; + cs->name = sha256->name; + cs->impl = sha256->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha256->set_fastest(id); + } + } + sha256->setid(id_save); + + /* sha512 */ + id_save = sha512->getid(); + for (max = 0, id = 0; id < sha512->getcnt(); id++) { + sha512->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_sha512_native; + cs->free = 0; + cs->name = sha512->name; + cs->impl = sha512->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + sha512->set_fastest(id); + } + } + sha512->setid(id_save); + + /* blake3 */ + id_save = blake3->getid(); + for (max = 0, id = 0; id < blake3->getcnt(); id++) { + blake3->setid(id); + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_blake3_tmpl_init; + cs->func = abd_checksum_blake3_native; + cs->free = abd_checksum_blake3_tmpl_free; + cs->name = blake3->name; + cs->impl = blake3->getname(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + blake3->set_fastest(id); + } + } + blake3->setid(id_save); +} + +void +chksum_init(void) +{ +#ifdef _KERNEL + blake3_per_cpu_ctx_init(); +#endif + + /* Benchmark supported implementations */ + chksum_benchmark(); + + /* Install kstats for all implementations */ + chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (chksum_kstat != NULL) { + chksum_kstat->ks_data = NULL; + chksum_kstat->ks_ndata = UINT32_MAX; + kstat_set_raw_ops(chksum_kstat, + chksum_kstat_headers, + chksum_kstat_data, + chksum_kstat_addr); + kstat_install(chksum_kstat); + } +} + +void +chksum_fini(void) +{ + if (chksum_kstat != NULL) { + kstat_delete(chksum_kstat); + chksum_kstat = NULL; + } + + if (chksum_stat_cnt) { + kmem_free(chksum_stat_data, + sizeof (chksum_stat_t) * chksum_stat_cnt); + chksum_stat_cnt = 0; + chksum_stat_data = 0; + } + +#ifdef _KERNEL + blake3_per_cpu_ctx_fini(); +#endif +} diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index 007f31b4e7b3..2f43c4aa41b8 100644 --- 
a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -59,7 +59,7 @@ * read I/Os, there are basically three 'types' of I/O, which form a roughly * layered diagram: * - * +---------------+ + * +---------------+ * | Aggregate I/O | No associated logical data or device * +---------------+ * | @@ -124,14 +124,14 @@ static taskqid_t recent_events_cleaner_tqid; * This setting can be changed dynamically and setting it to zero * disables duplicate detection. */ -unsigned int zfs_zevent_retain_max = 2000; +static unsigned int zfs_zevent_retain_max = 2000; /* * The lifespan for a recent ereport entry. The default of 15 minutes is * intended to outlive the zfs diagnosis engine's threshold of 10 errors * over a period of 10 minutes. */ -unsigned int zfs_zevent_retain_expire_secs = 900; +static unsigned int zfs_zevent_retain_expire_secs = 900; typedef enum zfs_subclass { ZSC_IO, @@ -200,12 +200,53 @@ recent_events_compare(const void *a, const void *b) return (0); } +/* + * workaround: vdev properties don't have inheritance + */ +static uint64_t +vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) +{ + uint64_t propdef, propval; + + propdef = vdev_prop_default_numeric(prop); + switch (prop) { + case VDEV_PROP_CHECKSUM_N: + propval = vd->vdev_checksum_n; + break; + case VDEV_PROP_CHECKSUM_T: + propval = vd->vdev_checksum_t; + break; + case VDEV_PROP_IO_N: + propval = vd->vdev_io_n; + break; + case VDEV_PROP_IO_T: + propval = vd->vdev_io_t; + break; + case VDEV_PROP_SLOW_IO_N: + propval = vd->vdev_slow_io_n; + break; + case VDEV_PROP_SLOW_IO_T: + propval = vd->vdev_slow_io_t; + break; + default: + propval = propdef; + break; + } + + if (propval != propdef) + return (propval); + + if (vd->vdev_parent == NULL) + return (propdef); + + return (vdev_prop_get_inherited(vd->vdev_parent, prop)); +} + static void zfs_ereport_schedule_cleaner(void); /* * background task to clean stale recent event nodes. 
*/ -/*ARGSUSED*/ static void zfs_ereport_cleaner(void *arg) { @@ -254,7 +295,6 @@ void zfs_ereport_clear(spa_t *spa, vdev_t *vd) { uint64_t vdev_guid, pool_guid; - int cnt = 0; ASSERT(vd != NULL || spa != NULL); if (vd == NULL) { @@ -278,7 +318,6 @@ zfs_ereport_clear(spa_t *spa, vdev_t *vd) avl_remove(&recent_events_tree, entry); list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); - cnt++; } } @@ -665,6 +704,69 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, zb->zb_blkid, NULL); } + /* + * Payload for tuning the zed + */ + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { + uint64_t cksum_n, cksum_t; + + cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N); + if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N, + DATA_TYPE_UINT64, + cksum_n, + NULL); + + cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T); + if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T, + DATA_TYPE_UINT64, + cksum_t, + NULL); + } + + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) { + uint64_t io_n, io_t; + + io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N); + if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N, + DATA_TYPE_UINT64, + io_n, + NULL); + + io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T); + if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T, + DATA_TYPE_UINT64, + io_t, + NULL); + } + + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + uint64_t slow_io_n, slow_io_t; + + slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); + if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + DATA_TYPE_UINT64, + slow_io_n, + NULL); + + slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); + if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + DATA_TYPE_UINT64, + slow_io_t, + NULL); + } + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; @@ -678,10 +780,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, #define MAX_RANGES 16 typedef struct zfs_ecksum_info { - /* histograms of set and cleared bits by bit number in a 64-bit word */ - uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; - /* inline arrays of bits set and cleared. 
*/ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; @@ -705,7 +803,7 @@ typedef struct zfs_ecksum_info { } zfs_ecksum_info_t; static void -update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) +update_bad_bits(uint64_t value_arg, uint32_t *count) { size_t i; size_t bits = 0; @@ -713,10 +811,8 @@ update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { - if (value & (1ull << i)) { - hist[63 - i]++; + if (value & (1ull << i)) ++bits; - } } /* update the count of bits changed */ *count += bits; @@ -826,9 +922,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, const uint64_t *good; const uint64_t *bad; - uint64_t allset = 0; - uint64_t allcleared = 0; - size_t nui64s = size / sizeof (uint64_t); size_t inline_size; @@ -847,14 +940,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_expected) / sizeof (uint64_t), - (uint64_t *)&info->zbc_expected, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_actual) / sizeof (uint64_t), - (uint64_t *)&info->zbc_actual, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, @@ -930,9 +1015,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, // bits set in good, but not in bad cleared = (good[idx] & (~bad[idx])); - allset |= set; - allcleared |= cleared; - if (!no_inline) { ASSERT3U(offset, <, inline_size); eip->zei_bits_set[offset] = set; @@ -940,10 +1022,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, offset++; } - update_histogram(set, eip->zei_histogram_set, - &eip->zei_range_sets[range]); - update_histogram(cleared, eip->zei_histogram_cleared, - &eip->zei_range_clears[range]); + update_bad_bits(set, &eip->zei_range_sets[range]); + update_bad_bits(cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ @@ -979,23 +1059,14 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); - } else { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, - NULL); } return (eip); } #else -/*ARGSUSED*/ void zfs_ereport_clear(spa_t *spa, vdev_t *vd) { + (void) spa, (void) vd; } #endif @@ -1025,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) return (B_FALSE); if (zio != NULL) { - /* - * If this is not a read or write zio, ignore the error. This - * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. 
- */ + /* If this is not a read or write zio, ignore the error */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) return (B_FALSE); @@ -1072,6 +1140,8 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) (zio != NULL) && (!zio->io_timestamp)) { return (B_FALSE); } +#else + (void) subclass, (void) spa, (void) vd, (void) zio; #endif return (B_TRUE); } @@ -1112,6 +1182,9 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, /* Cleanup is handled by the callback function */ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); +#else + (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio, + (void) state; #endif return (rc); } @@ -1141,6 +1214,8 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return (SET_ERROR(EBUSY)); +#else + (void) zb, (void) offset; #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); @@ -1150,7 +1225,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, /* copy the checksum failure information if it was provided */ if (info != NULL) { report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); - bcopy(info, report->zcr_ckinfo, sizeof (*info)); + memcpy(report->zcr_ckinfo, info, sizeof (*info)); } report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; @@ -1193,6 +1268,9 @@ zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, report->zcr_ereport = report->zcr_detector = NULL; if (info != NULL) kmem_free(info, sizeof (*info)); +#else + (void) report, (void) good_data, (void) bad_data, + (void) drop_if_identical; #endif } @@ -1257,6 +1335,9 @@ zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } +#else + (void) spa, (void) vd, (void) zb, (void) zio, (void) offset, + (void) length, (void) good_data, (void) bad_data, (void) zbc; #endif return (rc); } @@ -1321,7 +1402,8 @@ zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) (void) nvlist_add_nvpair(resource, elem); } - +#else + (void) spa, (void) vd, (void) type, (void) name, (void) aux; #endif return (resource); } @@ -1336,6 +1418,8 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, resource = zfs_event_create(spa, vd, type, name, aux); if (resource) zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); +#else + (void) spa, (void) vd, (void) type, (void) name, (void) aux; #endif } @@ -1380,17 +1464,17 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) aux = fm_nvlist_create(NULL); if (vd && aux) { if (vd->vdev_physpath) { - (void) nvlist_add_string(aux, + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, vd->vdev_physpath); } if (vd->vdev_enc_sysfs_path) { - (void) nvlist_add_string(aux, + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); } - (void) nvlist_add_uint64(aux, + fnvlist_add_uint64(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); } @@ -1399,6 +1483,8 @@ zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) if (aux) fm_nvlist_destroy(aux, FM_NVA_FREE); +#else + (void) spa, (void) vd, (void) laststate; #endif } @@ -1434,9 +1520,8 @@ zfs_ereport_fini(void) { recent_events_node_t *entry; - while ((entry = list_head(&recent_events_list)) != NULL) { + while ((entry = 
list_remove_head(&recent_events_list)) != NULL) { avl_remove(&recent_events_tree, entry); - list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } avl_destroy(&recent_events_tree); @@ -1450,7 +1535,7 @@ zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) nvlist_t *aux; aux = fm_nvlist_create(NULL); - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); fm_nvlist_destroy(aux, FM_NVA_FREE); @@ -1485,12 +1570,12 @@ zfs_ereport_zvol_post(const char *subclass, const char *name, return; aux = fm_nvlist_create(NULL); - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, raw_name); r = strchr(name, '/'); if (r && r[1]) - nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); + fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); fm_nvlist_destroy(aux, FM_NVA_FREE); diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c index a90bf5feeea1..add4241dcc99 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -61,7 +61,7 @@ typedef struct fuid_domain { uint64_t f_idx; } fuid_domain_t; -static char *nulldomain = ""; +static const char *const nulldomain = ""; /* * Compare two indexes. @@ -133,7 +133,7 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, for (i = 0; i != count; i++) { fuid_domain_t *domnode; - char *domain; + const char *domain; uint64_t idx; VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, @@ -171,7 +171,7 @@ zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) avl_destroy(idx_tree); } -char * +const char * zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) { fuid_domain_t searchnode, *findnode; @@ -258,8 +258,8 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, domnode->f_ksid->kd_name) == 0); } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, numnodes) == 0); + fnvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + (const nvlist_t * const *)fuids, numnodes); for (i = 0; i != numnodes; i++) nvlist_free(fuids[i]); kmem_free(fuids, numnodes * sizeof (void *)); @@ -290,9 +290,9 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) * necessary for the caller or another thread to detect the dirty table * and sync out the changes. 
*/ -int +static int zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, - char **retdomain, boolean_t addok) + const char **retdomain, boolean_t addok) { fuid_domain_t searchnode, *findnode; avl_index_t loc; @@ -358,7 +358,7 @@ retry: const char * zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) { - char *domain; + const char *domain; if (idx == 0 || !zfsvfs->z_use_fuids) return (NULL); @@ -518,8 +518,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, uint64_t idx; ksid_t *ksid; uint32_t rid; - char *kdomain; - const char *domain; + const char *kdomain, *domain; uid_t id; VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); @@ -574,8 +573,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) { #ifdef HAVE_KSID - const char *domain; - char *kdomain; + const char *domain, *kdomain; uint32_t fuid_idx = FUID_INDEX(id); uint32_t rid = 0; idmap_stat status; @@ -624,7 +622,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, rid = FUID_RID(fuidp->z_fuid_group); idx = FUID_INDEX(fuidp->z_fuid_group); break; - }; + } domain = fuidp->z_domain_table[idx - 1]; } else { if (type == ZFS_OWNER || type == ZFS_ACE_USER) @@ -701,19 +699,15 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp) zfs_fuid_t *zfuid; zfs_fuid_domain_t *zdomain; - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); + while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL) kmem_free(zfuid, sizeof (zfs_fuid_t)); - } if (fuidp->z_domain_table != NULL) kmem_free(fuidp->z_domain_table, (sizeof (char *)) * fuidp->z_domain_cnt); - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); + while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL) kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } kmem_free(fuidp, sizeof (zfs_fuid_info_t)); } diff --git a/sys/contrib/openzfs/module/zfs/spa_boot.c b/sys/contrib/openzfs/module/zfs/zfs_impl.c index 674394650f82..20322ff98b31 100644 --- a/sys/contrib/openzfs/module/zfs/spa_boot.c +++ b/sys/contrib/openzfs/module/zfs/zfs_impl.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,31 +20,42 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> */ -#ifdef _KERNEL +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/zfs_impl.h> -#include <sys/zio.h> -#include <sys/spa_boot.h> -#include <sys/sunddi.h> +#include <sys/blake3.h> +#include <sys/sha2.h> -char * -spa_get_bootprop(char *propname) +/* + * impl_ops - backend for implementations of algorithms + */ +const zfs_impl_t *impl_ops[] = { + &zfs_blake3_ops, + &zfs_sha256_ops, + &zfs_sha512_ops, + NULL +}; + +/* + * zfs_impl_get_ops - Get the API functions for an impl backend + */ +const zfs_impl_t * +zfs_impl_get_ops(const char *algo) { - char *value; + const zfs_impl_t **ops = impl_ops; - if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), - DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS) - return (NULL); - return (value); -} + if (!algo || !*algo) + return (*ops); -void -spa_free_bootprop(char *value) -{ - ddi_prop_free(value); -} + for (; *ops; ops++) { + if (strcmp(algo, (*ops)->name) == 0) + break; + } -#endif /* _KERNEL */ + ASSERT3P(ops, !=, NULL); + return (*ops); +} diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 96a021acbc95..7b527eb75e83 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,11 +23,11 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. - * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net> + * Copyright (c) 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -38,8 +38,9 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2021, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright 2024 Oxide Computer Company */ /* @@ -222,23 +223,22 @@ #include <sys/zfs_ioctl_impl.h> kmutex_t zfsdev_state_lock; -zfsdev_state_t *zfsdev_state_list; +static zfsdev_state_t zfsdev_state_listhead; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ -unsigned long zfs_max_nvlist_src_size = 0; +uint64_t zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). 
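/*
 * Editorial sketch, not part of the patch: zfs_impl_get_ops() in the new
 * zfs_impl.c above walks a NULL-terminated table of checksum backends,
 * returns the first entry when no algorithm name is given, and yields NULL
 * when nothing matches. The same pattern stripped to stand-alone C
 * (struct and function names here are illustrative):
 */
#include <stddef.h>
#include <string.h>

typedef struct backend {
	const char *name;
} backend_t;

static const backend_t *
backend_lookup(const backend_t *const *table, const char *algo)
{
	const backend_t *const *ops = table;

	if (algo == NULL || *algo == '\0')
		return (*ops);		/* default: first registered backend */

	for (; *ops != NULL; ops++) {
		if (strcmp(algo, (*ops)->name) == 0)
			break;
	}
	return (*ops);			/* NULL if no name matched */
}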
*/ -unsigned long zfs_history_output_max = 1024 * 1024; +static uint64_t zfs_history_output_max = 1024 * 1024; -uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ @@ -373,10 +373,10 @@ zfs_log_history(zfs_cmd_t *zc) * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ -/* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; return (0); } @@ -384,10 +384,10 @@ zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ -/* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl, (void) cr; if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); @@ -604,7 +604,7 @@ static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { - char *strval; + const char *strval; /* * Check permissions for special properties. @@ -656,35 +656,29 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } -/* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - int error; - - error = zfs_dozonecheck(zc->zc_name, cr); - if (error != 0) - return (error); - /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ - return (0); + (void) innvl; + return (zfs_dozonecheck(zc->zc_name, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; @@ -717,10 +711,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } @@ -728,12 +722,14 @@ zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } @@ -745,7 +741,7 @@ zfs_get_parent(const char *datasetname, char *parent, int parentsize) /* * Remove the @bla or /bla from the end of the name to get the parent. 
*/ - (void) strncpy(parent, datasetname, parentsize); + (void) strlcpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; @@ -771,10 +767,10 @@ zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } @@ -782,10 +778,10 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ -/* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; @@ -844,17 +840,17 @@ zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; dsl_pool_t *dp; dsl_dataset_t *clone; int error; @@ -899,10 +895,10 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, @@ -917,13 +913,6 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) ZFS_DELEG_PERM_CREATE, cr)); } -/* ARGSUSED */ -static int -zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) -{ - return (zfs_secpolicy_recv(zc, innvl, cr)); -} - int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { @@ -934,10 +923,10 @@ zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) /* * Check for permission to create each snapshot in the nvlist. */ -/* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvlist_t *snaps; int error = 0; nvpair_t *pair; @@ -946,7 +935,7 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { - char *name = nvpair_name(pair); + char *name = (char *)nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { @@ -965,15 +954,15 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) /* * Check for permission to create each bookmark in the nvlist. 
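/*
 * Editorial sketch, not part of the patch: the zfs_get_parent() change at
 * the top of this hunk swaps strncpy() for strlcpy(), which always
 * NUL-terminates the destination (for a non-zero size), so the strrchr()
 * calls that follow operate on a proper C string. Minimal illustration,
 * assuming a platform that provides strlcpy():
 */
#include <string.h>

static void
copy_parent_name(char *parent, const char *datasetname, size_t parentsize)
{
	/* truncates safely and terminates; strncpy() may leave no '\0' */
	(void) strlcpy(parent, datasetname, parentsize);
}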
*/ -/* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); + char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { @@ -990,16 +979,16 @@ zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { - char *name = nvpair_name(pair); + char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); @@ -1031,10 +1020,10 @@ zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (error); } -/* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl, (void) cr; /* * Even root must have a proper TSD so that we know what pool * to log to. @@ -1047,9 +1036,9 @@ zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - char parentname[ZFS_MAX_DATASET_NAME_LEN]; - int error; - char *origin; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + const char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) @@ -1072,10 +1061,11 @@ zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ -/* ARGSUSED */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl; + if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); @@ -1085,13 +1075,13 @@ zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) /* * Policy for object to name lookups. */ -/* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; int error; - if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + if (secpolicy_sys_config(cr, B_FALSE) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); @@ -1101,20 +1091,20 @@ zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) /* * Policy for fault injection. Requires all privileges. 
*/ -/* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc, (void) innvl; return (secpolicy_zinject(cr)); } -/* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, @@ -1174,18 +1164,18 @@ zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) userquota_perms[zc->zc_objset_type], cr)); } -/* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) innvl; return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } -/* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvpair_t *pair; nvlist_t *holds; int error; @@ -1206,10 +1196,10 @@ zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (0); } -/* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + (void) zc; nvpair_t *pair; int error; @@ -1240,8 +1230,8 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) */ int error; - if ((error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_DIFF, cr)) == 0) + if (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_DIFF, cr) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); @@ -1289,8 +1279,7 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) packed = vmem_alloc(size, KM_SLEEP); - if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, - iflag)) != 0) { + if (ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } @@ -1407,7 +1396,8 @@ getzfsvfs(const char *dsname, zfsvfs_t **zfvp) * which prevents all inode ops from running. 
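/*
 * Editorial sketch, not part of the patch: the recurring edit in these
 * zfs_secpolicy_*() hunks drops the old lint-style ARGSUSED comments and
 * instead casts the unused parameters to void at the top of the function,
 * which also satisfies -Wunused-parameter. The shape of the idiom, with
 * illustrative parameter types:
 */
static int
example_secpolicy(void *zc, void *innvl, void *cr)
{
	(void) zc, (void) innvl, (void) cr;	/* intentionally unused */
	return (0);
}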
*/ static int -zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) +zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp, + boolean_t writer) { int error = 0; @@ -1432,7 +1422,7 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) } static void -zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); @@ -1470,7 +1460,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; - char *tname; + const char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); @@ -1592,8 +1582,9 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) nvlist_t *configs; int error; - if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (SET_ERROR(EEXIST)); + error = spa_all_configs(&zc->zc_cookie, &configs); + if (error) + return (error); error = put_nvlist(zc, configs); @@ -1695,6 +1686,47 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * poolname name of the pool + * scan_type scan func (pool_scan_func_t) + * scan_command scrub pause/resume flag (pool_scrub_cmd_t) + */ +static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { + {"scan_type", DATA_TYPE_UINT64, 0}, + {"scan_command", DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + uint64_t scan_type, scan_cmd; + + if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0) + return (SET_ERROR(EINVAL)); + + if (scan_cmd >= POOL_SCRUB_FLAGS_END) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + if (scan_cmd == POOL_SCRUB_PAUSE) { + error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); + } else if (scan_type == POOL_SCAN_NONE) { + error = spa_scan_stop(spa); + } else { + error = spa_scan(spa, scan_type); + } + + spa_close(spa, FTAG); + return (error); +} + static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { @@ -1855,7 +1887,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { - error = spa_vdev_add(spa, config); + error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); @@ -1921,6 +1953,10 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; + case VDEV_STATE_REMOVED: + error = vdev_remove_wanted(spa, zc->zc_guid); + break; + default: error = SET_ERROR(EINVAL); } @@ -2044,7 +2080,7 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) dmu_objset_fast_stat(os, &zc->zc_objset_stats); - if (zc->zc_nvlist_dst != 0 && + if (!zc->zc_simple && zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* @@ -2300,7 +2336,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) } while (error == 0) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { + if (issig()) { error = SET_ERROR(EINTR); break; } @@ -2331,6 +2367,7 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) } if (zc->zc_simple) { + dsl_dataset_fast_stat(ds, &zc->zc_objset_stats); dsl_dataset_rele(ds, FTAG); break; } @@ -2416,7 +2453,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, const char *strval = NULL; int err = -1; - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if 
(zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); @@ -2486,11 +2523,27 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; - case ZFS_PROP_SNAPDEV: - err = zvol_set_snapdev(dsname, source, intval); + case ZFS_PROP_VOLTHREADING: + err = zvol_set_volthreading(dsname, intval); + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; break; + case ZFS_PROP_SNAPDEV: case ZFS_PROP_VOLMODE: - err = zvol_set_volmode(dsname, source, intval); + err = zvol_set_common(dsname, prop, source, intval); + break; + case ZFS_PROP_READONLY: + err = zvol_set_ro(dsname, intval); + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; break; case ZFS_PROP_VERSION: { @@ -2558,6 +2611,7 @@ zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvpair_t *pair; nvpair_t *propval; int rv = 0; + int err; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; @@ -2569,7 +2623,7 @@ retry: while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); - int err = 0; + err = 0; /* decode the property value */ propval = pair; @@ -2586,7 +2640,7 @@ retry: /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); - } else if (err == 0 && prop == ZPROP_INVAL) { + } else if (err == 0 && prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); @@ -2668,47 +2722,52 @@ retry: goto retry; } - if (!nvlist_empty(genericnvl) && - dsl_props_set(dsname, source, genericnvl) != 0) { - /* - * If this fails, we still want to set as many properties as we - * can, so try setting them individually. - */ - pair = NULL; - while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { - const char *propname = nvpair_name(pair); - int err = 0; - - propval = pair; - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - nvlist_t *attrs; - attrs = fnvpair_value_nvlist(pair); - propval = fnvlist_lookup_nvpair(attrs, - ZPROP_VALUE); - } + if (nvlist_empty(genericnvl)) + goto out; - if (nvpair_type(propval) == DATA_TYPE_STRING) { - strval = fnvpair_value_string(propval); - err = dsl_prop_set_string(dsname, propname, - source, strval); - } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { - err = dsl_prop_inherit(dsname, propname, - source); - } else { - intval = fnvpair_value_uint64(propval); - err = dsl_prop_set_int(dsname, propname, source, - intval); - } + /* + * Try to set them all in one batch. + */ + err = dsl_props_set(dsname, source, genericnvl); + if (err == 0) + goto out; - if (err != 0) { - if (errlist != NULL) { - fnvlist_add_int32(errlist, propname, - err); - } - rv = err; + /* + * If batching fails, we still want to set as many properties as we + * can, so try setting them individually. 
+ */ + pair = NULL; + while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + attrs = fnvpair_value_nvlist(pair); + propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); + } + + if (nvpair_type(propval) == DATA_TYPE_STRING) { + strval = fnvpair_value_string(propval); + err = dsl_prop_set_string(dsname, propname, + source, strval); + } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { + err = dsl_prop_inherit(dsname, propname, source); + } else { + intval = fnvpair_value_uint64(propval); + err = dsl_prop_set_int(dsname, propname, source, + intval); + } + + if (err != 0) { + if (errlist != NULL) { + fnvlist_add_int32(errlist, propname, err); } + rv = err; } } + +out: if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); @@ -2856,11 +2915,11 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) * and reservation to the received or default values even though * they are not considered inheritable. */ - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); @@ -2928,7 +2987,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { @@ -2982,6 +3041,96 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) } /* + * innvl: { + * "vdevprops_set_vdev" -> guid + * "vdevprops_set_props" -> { prop -> value } + * } + * + * outnvl: propname -> error code (int32) + */ +static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = { + {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0}, + {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0} +}; + +static int +zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + vdev_t *vd; + uint64_t vdev_guid; + + /* Early validation */ + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (outnvl == NULL) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + ASSERT(spa_writeable(spa)); + + if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOENT)); + } + + error = vdev_prop_set(vd, innvl, outnvl); + + spa_close(spa, FTAG); + + return (error); +} + +/* + * innvl: { + * "vdevprops_get_vdev" -> guid + * (optional) "vdevprops_get_props" -> { propname -> propid } + * } + * + * outnvl: propname -> value + */ +static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = { + {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0}, + {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL} +}; + +static int +zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + vdev_t *vd; + uint64_t vdev_guid; + + /* Early validation */ + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, + &vdev_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (outnvl == NULL) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { + 
spa_close(spa, FTAG); + return (SET_ERROR(ENOENT)); + } + + error = vdev_prop_get(vd, innvl, outnvl); + + spa_close(spa, FTAG); + + return (error); +} + +/* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions @@ -3002,7 +3151,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) /* * Verify nvlist is constructed correctly */ - if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + if (zfs_deleg_verify_nvlist(fsaclnv) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } @@ -3052,7 +3201,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc) return (error); } -/* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { @@ -3414,11 +3562,11 @@ static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ + (void) fsname, (void) innvl, (void) outnvl; return (0); } @@ -3506,10 +3654,10 @@ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; -/* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { + (void) unused, (void) outnvl; const char *message; char *poolname; spa_t *spa; @@ -3612,10 +3760,10 @@ zfs_unmount_snap(const char *snapname) (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } -/* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { + (void) arg; zfs_unmount_snap(snapname); return (0); } @@ -3659,7 +3807,6 @@ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -3712,10 +3859,10 @@ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; -/* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) poolname; return (dsl_bookmark_create(innvl, outnvl)); } @@ -3752,11 +3899,11 @@ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; @@ -3827,7 +3974,7 @@ static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { - char *program; + const char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; @@ -3861,10 +4008,10 @@ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl, (void) outnvl; return (spa_checkpoint(poolname)); } @@ -3876,11 +4023,11 @@ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl, (void) outnvl; return (spa_checkpoint_discard(poolname)); } @@ -3981,7 +4128,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || - cmd_type == POOL_INITIALIZE_SUSPEND)) { + cmd_type == POOL_INITIALIZE_SUSPEND || + cmd_type == POOL_INITIALIZE_UNINIT)) { return (SET_ERROR(EINVAL)); } @@ -4242,13 +4390,12 @@ static const zfs_ioc_key_t 
zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; - char *target = NULL; + const char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); @@ -4314,16 +4461,17 @@ recursive_unmount(const char *fsname, void *arg) * outnvl is unused */ -/* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; + static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; nvlist_t *redactnvl = NULL; - char *redactbook = NULL; + const char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); @@ -4403,7 +4551,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) uint64_t intval, compval; int err; - if (prop == ZPROP_INVAL) { + if (prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) @@ -4719,10 +4867,10 @@ propval_equals(nvpair_t *p1, nvpair_t *p2) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { - char *valstr1, *valstr2; + const char *valstr1, *valstr2; - VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); - VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); + VERIFY(nvpair_value_string(p1, &valstr1) == 0); + VERIFY(nvpair_value_string(p2, &valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; @@ -4787,6 +4935,11 @@ extract_delay_props(nvlist_t *props) static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, + /* + * Setting ZFS_PROP_SHARESMB requires the objset type to be + * known, which is not possible prior to receipt of raw sends. + */ + ZFS_PROP_SHARESMB, 0 }; int i; @@ -4838,9 +4991,9 @@ static boolean_t zfs_ioc_recv_inject_err; * encountered errors, if any. It's the callers responsibility to free. 
*/ static int -zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, - nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, - boolean_t resumable, int input_fd, +zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, + nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, + boolean_t force, boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { @@ -4850,6 +5003,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; + nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; @@ -4865,7 +5019,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); - error = dmu_recv_begin(tofs, tosnap, begin_record, force, + error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) @@ -4949,7 +5103,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); - if (prop != ZPROP_INVAL) { + if (prop != ZPROP_USERPROP) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) @@ -4964,6 +5118,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); + inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); @@ -5021,6 +5176,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } + if (inherited_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, + inherited_delayprops, *errors); + } } /* @@ -5040,6 +5199,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } + if (inherited_delayprops != NULL) { + ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); + nvlist_free(inherited_delayprops); + } *read_bytes = off - noff; #ifdef ZFS_DEBUG @@ -5176,15 +5339,16 @@ zfs_ioc_recv(zfs_cmd_t *zc) nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; - char *origin = NULL; + const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || - strchr(zc->zc_value, '%')) + strchr(zc->zc_value, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5192,13 +5356,15 @@ zfs_ioc_recv(zfs_cmd_t *zc) if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &recvdprops)) != 0) - return (error); + zc->zc_iflags, &recvdprops)) != 0) { + goto out; + } if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - zc->zc_iflags, 
&localprops)) != 0) - return (error); + zc->zc_iflags, &localprops)) != 0) { + goto out; + } if (zc->zc_string[0]) origin = zc->zc_string; @@ -5208,10 +5374,8 @@ zfs_ioc_recv(zfs_cmd_t *zc) begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, - NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, + NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); - nvlist_free(recvdprops); - nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop @@ -5227,7 +5391,10 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = SET_ERROR(EINVAL); } +out: nvlist_free(errors); + nvlist_free(recvdprops); + nvlist_free(localprops); return (error); } @@ -5241,6 +5408,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) + * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused @@ -5261,6 +5429,7 @@ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, @@ -5276,11 +5445,12 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; - char *snapname; - char *origin = NULL; + const char *snapname; + const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; + boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; @@ -5291,8 +5461,9 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || - strchr(snapname, '%')) + strchr(snapname, '%') != NULL) { return (SET_ERROR(EINVAL)); + } (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); @@ -5310,36 +5481,47 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); + heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) - return (error); + goto out; error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) - return (error); + goto out; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, - hidden_args, force, resumable, input_fd, begin_record, + hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); +out: nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); + nvlist_free(hidden_args); return 
(error); } +/* + * When stack space is limited, we write replication stream data to the target + * on a separate taskq thread, to make sure there's enough stack space. + */ +#ifndef HAVE_LARGE_STACKS +#define USE_SEND_TASKQ 1 +#endif + typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; @@ -5360,31 +5542,65 @@ dump_bytes_cb(void *arg) dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } +typedef struct dump_bytes_arg { + zfs_file_t *dba_fp; +#ifdef USE_SEND_TASKQ + taskq_t *dba_tq; + taskq_ent_t dba_tqent; +#endif +} dump_bytes_arg_t; + static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { + dump_bytes_arg_t *dba = (dump_bytes_arg_t *)arg; dump_bytes_io_t dbi; - dbi.dbi_fp = arg; + dbi.dbi_fp = dba->dba_fp; dbi.dbi_buf = buf; dbi.dbi_len = len; -#if defined(HAVE_LARGE_STACKS) - dump_bytes_cb(&dbi); +#ifdef USE_SEND_TASKQ + taskq_dispatch_ent(dba->dba_tq, dump_bytes_cb, &dbi, TQ_SLEEP, + &dba->dba_tqent); + taskq_wait(dba->dba_tq); #else - /* - * The vn_rdwr() call is performed in a taskq to ensure that there is - * always enough stack space to write safely to the target filesystem. - * The ZIO_TYPE_FREE threads are used because there can be a lot of - * them and they are used in vdev_file.c for a similar purpose. - */ - spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, - ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); -#endif /* HAVE_LARGE_STACKS */ + dump_bytes_cb(&dbi); +#endif return (dbi.dbi_err); } +static int +dump_bytes_init(dump_bytes_arg_t *dba, int fd, dmu_send_outparams_t *out) +{ + zfs_file_t *fp = zfs_file_get(fd); + if (fp == NULL) + return (SET_ERROR(EBADF)); + + dba->dba_fp = fp; +#ifdef USE_SEND_TASKQ + dba->dba_tq = taskq_create("z_send", 1, defclsyspri, 0, 0, 0); + taskq_init_ent(&dba->dba_tqent); +#endif + + memset(out, 0, sizeof (dmu_send_outparams_t)); + out->dso_outfunc = dump_bytes; + out->dso_arg = dba; + out->dso_dryrun = B_FALSE; + + return (0); +} + +static void +dump_bytes_fini(dump_bytes_arg_t *dba) +{ + zfs_file_put(dba->dba_fp); +#ifdef USE_SEND_TASKQ + taskq_destroy(dba->dba_tq); +#endif +} + /* * inputs: * zc_name name of snapshot to send @@ -5469,21 +5685,18 @@ zfs_ioc_send(zfs_cmd_t *zc) dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { - zfs_file_t *fp; - dmu_send_outparams_t out = {0}; - - if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) - return (SET_ERROR(EBADF)); + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, zc->zc_cookie, &out); + if (error) + return (error); - off = zfs_file_off(fp); - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); } return (error); } @@ -5585,17 +5798,12 @@ zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, - &count); - if (error == 0) - zc->zc_nvlist_dst_size = count; - else - zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + &zc->zc_nvlist_dst_size); spa_close(spa, FTAG); @@ -5654,10 +5862,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. 
Check for remote activity. */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); @@ -5703,10 +5914,10 @@ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; @@ -6005,10 +6216,6 @@ zfs_ioc_share(zfs_cmd_t *zc) return (SET_ERROR(ENOSYS)); } -ace_t full_access[] = { - {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} -}; - /* * inputs: * zc_name name of containing filesystem @@ -6117,10 +6324,10 @@ static const zfs_ioc_key_t zfs_keys_hold[] = { {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { + (void) pool; nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; @@ -6133,7 +6340,7 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { - char *htag; + const char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) @@ -6169,10 +6376,10 @@ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { + (void) args; return (dsl_dataset_get_holds(snapname, outnvl)); } @@ -6191,10 +6398,10 @@ static const zfs_ioc_key_t zfs_keys_release[] = { {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; -/* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { + (void) pool; return (dsl_dataset_user_release(holds, errlist)); } @@ -6252,7 +6459,7 @@ zfs_ioc_events_next(zfs_cmd_t *zc) static int zfs_ioc_events_clear(zfs_cmd_t *zc) { - int count; + uint_t count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; @@ -6353,7 +6560,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; - char *firstsnap; + const char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); @@ -6428,15 +6635,14 @@ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; int error; offset_t off; - char *fromname = NULL; + const char *fromname = NULL; int fd; - zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; @@ -6444,7 +6650,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; - char *redactbook = NULL; + const char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); @@ -6461,28 +6667,28 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); - if ((fp = zfs_file_get(fd)) == NULL) - return (SET_ERROR(EBADF)); - - off = zfs_file_off(fp); + dump_bytes_arg_t dba; + dmu_send_outparams_t out; + error = dump_bytes_init(&dba, fd, &out); + if (error) + return (error); - 
dmu_send_outparams_t out = {0}; - out.dso_outfunc = dump_bytes; - out.dso_arg = fp; - out.dso_dryrun = B_FALSE; + off = zfs_file_off(dba.dba_fp); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); - zfs_file_put(fp); + dump_bytes_fini(&dba); + return (error); } -/* ARGSUSED */ static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { + (void) os, (void) buf; uint64_t *size = arg; + *size += len; return (0); } @@ -6533,8 +6739,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; - char *fromname = NULL; - char *redactlist_book = NULL; + const char *fromname = NULL; + const char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; @@ -6674,10 +6880,10 @@ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; -/* ARGSUSED */ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { + (void) onvl; int err; boolean_t rc, force = B_FALSE; spa_t *spa; @@ -6717,10 +6923,10 @@ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; @@ -6759,10 +6965,10 @@ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; -/* ARGSUSED */ static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) innvl, (void) outnvl; int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { @@ -6780,7 +6986,7 @@ out: /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, - * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified + * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. 
* * innvl: { @@ -6797,10 +7003,10 @@ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; -/* ARGSUSED */ static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { + (void) outnvl; int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; @@ -7029,7 +7235,7 @@ zfs_ioctl_init(void) ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, - zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME, + zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, @@ -7107,6 +7313,21 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); + zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS, + zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props, + ARRAY_SIZE(zfs_keys_vdev_get_props)); + + zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS, + zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); + + zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, + zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_NONE, B_TRUE, B_TRUE, + zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, @@ -7268,7 +7489,7 @@ zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { - char *name = nvpair_name(pair); + const char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; @@ -7359,7 +7580,7 @@ zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) mutex_enter(&zfsdev_state_lock); - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; @@ -7381,9 +7602,9 @@ zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { - smp_rmb(); + membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); @@ -7435,7 +7656,7 @@ zfsdev_state_init(void *priv) if (minor == 0) return (SET_ERROR(ENXIO)); - for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; @@ -7719,13 +7940,11 @@ zfs_kmod_init(void) zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); - zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); - zfsdev_state_list->zs_minor = -1; + zfsdev_state_listhead.zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; - tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); @@ -7747,13 +7966,14 @@ zfs_kmod_fini(void) mutex_destroy(&zfsdev_state_lock); - for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { + for (zs = &zfsdev_state_listhead; zs != NULL; 
zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); - kmem_free(zs, sizeof (zfsdev_state_t)); + if (zs != &zfsdev_state_listhead) + kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ @@ -7761,15 +7981,12 @@ zfs_kmod_fini(void) spa_fini(); zvol_fini(); - tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, - "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); +ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW, + "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); -ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, - "Maximum size in bytes of ZFS ioctl output that will be logged"); -/* END CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW, + "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c index e248dc3cc4e8..fa4e7093ca46 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_log.c +++ b/sys/contrib/openzfs/module/zfs/zfs_log.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2018 by Delphix. All rights reserved. + * Copyright (c) 2022 by Pawel Jakub Dawidek */ @@ -107,86 +108,81 @@ zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) static void zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) { - uint32_t *bitmap; - uint64_t *attrs; - uint64_t *crtime; - xoptattr_t *xoap; - void *scanstamp; - int i; + xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT(xoap); lrattr->lr_attr_masksize = xvap->xva_mapsize; - bitmap = &lrattr->lr_attr_bitmap; - for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { + uint32_t *bitmap = &lrattr->lr_attr_bitmap; + for (int i = 0; i != xvap->xva_mapsize; i++, bitmap++) *bitmap = xvap->xva_reqattrmap[i]; - } - /* Now pack the attributes up in a single uint64_t */ - attrs = (uint64_t *)bitmap; - *attrs = 0; - crtime = attrs + 1; - bzero(crtime, 2 * sizeof (uint64_t)); - scanstamp = (caddr_t)(crtime + 2); - bzero(scanstamp, AV_SCANSTAMP_SZ); + lr_attr_end_t *end = (lr_attr_end_t *)bitmap; + end->lr_attr_attrs = 0; + end->lr_attr_crtime[0] = 0; + end->lr_attr_crtime[1] = 0; + memset(end->lr_attr_scanstamp, 0, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) - *attrs |= (xoap->xoa_readonly == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_readonly == 0) ? 0 : XAT0_READONLY; if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) - *attrs |= (xoap->xoa_hidden == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_hidden == 0) ? 0 : XAT0_HIDDEN; if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) - *attrs |= (xoap->xoa_system == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_system == 0) ? 0 : XAT0_SYSTEM; if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) - *attrs |= (xoap->xoa_archive == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_archive == 0) ? 
0 : XAT0_ARCHIVE; if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) - *attrs |= (xoap->xoa_immutable == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_immutable == 0) ? 0 : XAT0_IMMUTABLE; if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) - *attrs |= (xoap->xoa_nounlink == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_nounlink == 0) ? 0 : XAT0_NOUNLINK; if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) - *attrs |= (xoap->xoa_appendonly == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_appendonly == 0) ? 0 : XAT0_APPENDONLY; if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) - *attrs |= (xoap->xoa_opaque == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_opaque == 0) ? 0 : XAT0_APPENDONLY; if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) - *attrs |= (xoap->xoa_nodump == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_nodump == 0) ? 0 : XAT0_NODUMP; if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) - *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : XAT0_AV_QUARANTINED; if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) - *attrs |= (xoap->xoa_av_modified == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_av_modified == 0) ? 0 : XAT0_AV_MODIFIED; if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) - ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); + ZFS_TIME_ENCODE(&xoap->xoa_createtime, end->lr_attr_crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); - bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); + memcpy(end->lr_attr_scanstamp, xoap->xoa_av_scanstamp, + AV_SCANSTAMP_SZ); } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { /* * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid * at the same time, so we can share the same space. */ - bcopy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); + memcpy(end->lr_attr_scanstamp, &xoap->xoa_projid, + sizeof (uint64_t)); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) - *attrs |= (xoap->xoa_reparse == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_reparse == 0) ? 0 : XAT0_REPARSE; if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) - *attrs |= (xoap->xoa_offline == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_offline == 0) ? 0 : XAT0_OFFLINE; if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) - *attrs |= (xoap->xoa_sparse == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_sparse == 0) ? 0 : XAT0_SPARSE; if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) - *attrs |= (xoap->xoa_projinherit == 0) ? 0 : + end->lr_attr_attrs |= (xoap->xoa_projinherit == 0) ? 
0 : XAT0_PROJINHERIT; } @@ -214,7 +210,7 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) if (fuidp->z_domain_str_sz != 0) { for (zdomain = list_head(&fuidp->z_domains); zdomain; zdomain = list_next(&fuidp->z_domains, zdomain)) { - bcopy((void *)zdomain->z_domain, start, + memcpy(start, zdomain->z_domain, strlen(zdomain->z_domain) + 1); start = (caddr_t)start + strlen(zdomain->z_domain) + 1; @@ -392,7 +388,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, else lracl->lr_acl_flags = 0; - bcopy(vsecp->vsa_aclentp, end, aclsize); + memcpy(end, vsecp->vsa_aclentp, aclsize); end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); } @@ -404,7 +400,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, /* * Now place file name in log record */ - bcopy(name, end, namesize); + memcpy(end, name, namesize); zil_itx_assign(zilog, itx, tx); } @@ -426,7 +422,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, itx = zil_itx_create(txtype, sizeof (*lr) + namesize); lr = (lr_remove_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; - bcopy(name, (char *)(lr + 1), namesize); + memcpy(lr + 1, name, namesize); itx->itx_oid = foid; @@ -462,7 +458,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr = (lr_link_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_link_obj = zp->z_id; - bcopy(name, (char *)(lr + 1), namesize); + memcpy(lr + 1, name, namesize); zil_itx_assign(zilog, itx, tx); } @@ -493,8 +489,31 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, sizeof (uint64_t)); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), lr->lr_crtime, sizeof (uint64_t) * 2); - bcopy(name, (char *)(lr + 1), namesize); - bcopy(link, (char *)(lr + 1) + namesize, linksize); + memcpy((char *)(lr + 1), name, namesize); + memcpy((char *)(lr + 1) + namesize, link, linksize); + + zil_itx_assign(zilog, itx, tx); +} + +static void +do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, + const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) +{ + itx_t *itx; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + memcpy((char *)(lr + 1), sname, snamesize); + memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); + itx->itx_oid = szp->z_id; zil_itx_assign(zilog, itx, tx); } @@ -506,20 +525,73 @@ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) { + txtype |= TX_RENAME; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_EXCHANGE transactions. + */ +void +zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp) +{ + txtype |= TX_RENAME_EXCHANGE; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_WHITEOUT transactions. + * + * Unfortunately we cannot reuse do_zfs_log_rename because we we need to call + * zfs_mknode() on replay which requires stashing bits as with TX_CREATE. 
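The do_zfs_log_rename() helper added above sizes and fills its log record as in the following standalone sketch; struct fake_lr_rename and build_rename_record are illustrative names, not the in-tree lr_rename_t:

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct fake_lr_rename {
	uint64_t lr_sdoid;	/* source directory object id */
	uint64_t lr_tdoid;	/* target directory object id */
	/* NUL-terminated sname and dname follow the header */
};

static void *
build_rename_record(uint64_t sdoid, uint64_t tdoid,
    const char *sname, const char *dname, size_t *sizep)
{
	size_t snamesize = strlen(sname) + 1;
	size_t dnamesize = strlen(dname) + 1;
	size_t size = sizeof (struct fake_lr_rename) + snamesize + dnamesize;
	struct fake_lr_rename *lr = calloc(1, size);

	if (lr == NULL)
		return (NULL);
	lr->lr_sdoid = sdoid;
	lr->lr_tdoid = tdoid;
	/* Names are stored back to back immediately after the header. */
	memcpy((char *)(lr + 1), sname, snamesize);
	memcpy((char *)(lr + 1) + snamesize, dname, dnamesize);
	*sizep = size;
	return (lr);
}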
+ */ +void +zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp, znode_t *wzp) +{ itx_t *itx; - lr_rename_t *lr; + lr_rename_whiteout_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; if (zil_replaying(zilog, tx)) return; + txtype |= TX_RENAME_WHITEOUT; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; - bcopy(sname, (char *)(lr + 1), snamesize); - bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + lr = (lr_rename_whiteout_t *)&itx->itx_lr; + lr->lr_rename.lr_sdoid = sdzp->z_id; + lr->lr_rename.lr_tdoid = tdzp->z_id; + + /* + * RENAME_WHITEOUT will create an entry at the source znode, so we need + * to store the same data that the equivalent call to zfs_log_create() + * would. + */ + lr->lr_wfoid = wzp->z_id; + LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen, + sizeof (uint64_t)); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)), + lr->lr_wcrtime, sizeof (uint64_t) * 2); + lr->lr_wmode = wzp->z_mode; + lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp)); + lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp)); + + /* + * This rdev will always be makdevice(0, 0) but because the ZIL log and + * replay code needs to be platform independent (and there is no + * platform independent makdev()) we need to copy the one created + * during the rename operation. + */ + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev, + sizeof (lr->lr_wrdev)); + + memcpy((char *)(lr + 1), sname, snamesize); + memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); itx->itx_oid = szp->z_id; zil_itx_assign(zilog, itx, tx); @@ -530,17 +602,16 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, * called as soon as the write is on stable storage (be it via a DMU sync or a * ZIL commit). 
*/ -long zfs_immediate_write_sz = 32768; +static int64_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag, + znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, zil_callback_t callback, void *callback_data) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); uint32_t blocksize = zp->z_blksz; itx_wr_state_t write_state; - uintptr_t fsync_cnt; uint64_t gen = 0; ssize_t size = resid; @@ -556,15 +627,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, else if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz) write_state = WR_INDIRECT; - else if (ioflag & (O_SYNC | O_DSYNC)) + else if (commit) write_state = WR_COPIED; else write_state = WR_NEED_COPY; - if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { - (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); - } - (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen, sizeof (gen)); @@ -615,12 +682,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, BP_ZERO(&lr->lr_blkptr); itx->itx_private = ZTOZSB(zp); + itx->itx_sync = (zp->z_sync_cnt != 0); itx->itx_gen = gen; - if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) && - (fsync_cnt == 0)) - itx->itx_sync = B_FALSE; - itx->itx_callback = callback; itx->itx_callback_data = callback_data; zil_itx_assign(zilog, itx, tx); @@ -721,6 +785,40 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, } /* + * Handles TX_SETSAXATTR transactions. + */ +void +zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, const char *name, const void *value, size_t size) +{ + itx_t *itx; + lr_setsaxattr_t *lr; + size_t recsize = sizeof (lr_setsaxattr_t); + void *xattrstart; + int namelen; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + namelen = strlen(name) + 1; + recsize += (namelen + size); + itx = zil_itx_create(txtype, recsize); + lr = (lr_setsaxattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + xattrstart = (char *)(lr + 1); + memcpy(xattrstart, name, namelen); + if (value != NULL) { + memcpy((char *)xattrstart + namelen, value, size); + lr->lr_size = size; + } else { + lr->lr_size = 0; + } + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +/* * Handles TX_ACL transactions. */ void @@ -768,11 +866,11 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, if (txtype == TX_ACL_V0) { lrv0 = (lr_acl_v0_t *)lr; - bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + memcpy(lrv0 + 1, vsecp->vsa_aclentp, aclbytes); } else { void *start = (ace_t *)(lr + 1); - bcopy(vsecp->vsa_aclentp, start, aclbytes); + memcpy(start, vsecp->vsa_aclentp, aclbytes); start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); @@ -786,7 +884,52 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, zil_itx_assign(zilog, itx, tx); } -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW, +/* + * Handles TX_CLONE_RANGE transactions. 
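The write-state selection in the zfs_log_write() hunk above, now driven by a single commit flag instead of ioflag and the removed fsync-counter TSD, reduces to roughly this sketch; logbias_throughput and slog_present are illustrative stand-ins for the logbias check preceding the hunk and for spa_has_slogs():

#include <stdint.h>

typedef enum { WR_INDIRECT, WR_COPIED, WR_NEED_COPY } wr_state_t;

static wr_state_t
pick_write_state(int logbias_throughput, int slog_present,
    int64_t resid, int64_t immediate_write_sz, int commit)
{
	if (logbias_throughput)
		return (WR_INDIRECT);	/* log only the block pointer */
	if (!slog_present && resid >= immediate_write_sz)
		return (WR_INDIRECT);	/* large write and no SLOG */
	if (commit)
		return (WR_COPIED);	/* O_(D)SYNC or sync=always */
	return (WR_NEED_COPY);		/* copy lazily at commit time */
}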
+ */ +void +zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + itx->itx_sync = (zp->z_sync_cnt != 0); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW, "Largest data block to write to zil"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c index 7c56dd9c97f5..7bf804b67790 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_onexit.c +++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -87,8 +87,7 @@ zfs_onexit_destroy(zfs_onexit_t *zo) zfs_onexit_action_node_t *ap; mutex_enter(&zo->zo_lock); - while ((ap = list_head(&zo->zo_actions)) != NULL) { - list_remove(&zo->zo_actions, ap); + while ((ap = list_remove_head(&zo->zo_actions)) != NULL) { mutex_exit(&zo->zo_lock); ap->za_func(ap->za_data); kmem_free(ap, sizeof (zfs_onexit_action_node_t)); @@ -151,7 +150,7 @@ zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle) + uintptr_t *action_handle) { zfs_onexit_t *zo; zfs_onexit_action_node_t *ap; @@ -170,7 +169,7 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, list_insert_tail(&zo->zo_actions, ap); mutex_exit(&zo->zo_lock); if (action_handle) - *action_handle = (uint64_t)(uintptr_t)ap; + *action_handle = (uintptr_t)ap; return (0); } diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c index e61db5c7ab83..9b351eefc04e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_quota.c +++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,8 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. - * All rights reserved. 
+ * Copyright (c) 2011 Pawel Jakub Dawidek * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. diff --git a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c index b18b480ce527..091562ca6852 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c index e6ed3e738e40..2e0af60f6db4 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_replay.c +++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Cyril Plisko. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include <sys/types.h> @@ -47,6 +48,8 @@ #include <sys/atomic.h> #include <sys/cred.h> #include <sys/zpl.h> +#include <sys/dmu_objset.h> +#include <sys/zfeature.h> /* * NB: FreeBSD expects to be able to do vnode locking in lookup and @@ -68,7 +71,7 @@ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { - bzero(vap, sizeof (*vap)); + memset(vap, 0, sizeof (*vap)); vap->va_mask = (uint_t)mask; vap->va_mode = mode; #if defined(__FreeBSD__) || defined(__APPLE__) @@ -80,10 +83,10 @@ zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, vap->va_nodeid = nodeid; } -/* ARGSUSED */ static int zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) { + (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } @@ -141,13 +144,13 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); - bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); + memcpy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { /* * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid * at the same time, so we can share the same space. 
*/ - bcopy(scanstamp, &xoap->xoa_projid, sizeof (uint64_t)); + memcpy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); @@ -306,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); @@ -362,7 +367,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); - fallthrough; + zfs_fallthrough; case TX_CREATE_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -384,8 +389,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) lr->lr_uid, lr->lr_gid); } +#if defined(__linux__) + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, &vsec, zfs_init_idmap); +#else error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, &vsec); + 0, 0, &zp, kcred, vflg, &vsec, NULL); +#endif break; case TX_MKDIR_ACL: aclstart = (caddr_t)(lracl + 1); @@ -394,7 +404,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); - fallthrough; + zfs_fallthrough; case TX_MKDIR_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -414,8 +424,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } +#if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, &vsec); + &zp, kcred, vflg, &vsec, zfs_init_idmap); +#else + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, &vsec, NULL); +#endif break; default: error = SET_ERROR(ENOTSUP); @@ -457,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -500,9 +517,9 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) * * The _ATTR versions will grab the fuid info in their subcases. 
*/ - if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && - (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && - (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + if (txtype != TX_SYMLINK && + txtype != TX_MKDIR_ATTR && + txtype != TX_CREATE_ATTR) { start = (lr + 1); zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, @@ -519,14 +536,19 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; - fallthrough; + zfs_fallthrough; case TX_CREATE: if (name == NULL) name = (char *)start; +#if defined(__linux__) + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, NULL, zfs_init_idmap); +#else error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, NULL); + 0, 0, &zp, kcred, vflg, NULL, NULL); +#endif break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); @@ -537,14 +559,20 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; - fallthrough; + zfs_fallthrough; case TX_MKDIR: if (name == NULL) name = (char *)(lr + 1); +#if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, NULL); + &zp, kcred, vflg, NULL, zfs_init_idmap); +#else + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, NULL, NULL); +#endif + break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); @@ -552,8 +580,13 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; +#if defined(__linux__) + error = zfs_symlink(dzp, name, &xva.xva_vattr, + link, &zp, kcred, vflg, zfs_init_idmap); +#else error = zfs_symlink(dzp, name, &xva.xva_vattr, - link, &zp, kcred, vflg); + link, &zp, kcred, vflg, NULL); +#endif break; default: error = SET_ERROR(ENOTSUP); @@ -584,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -619,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -641,18 +678,21 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) } static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname, + char *tname, uint64_t rflags, vattr_t *wo_vap) { - zfsvfs_t *zfsvfs = arg1; - lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; - int error; - int vflg = 0; + int error, vflg = 0; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + /* Only Linux currently supports RENAME_* flags. */ +#ifdef __linux__ + VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT)); + + /* wo_vap must be non-NULL iff. 
we're doing RENAME_WHITEOUT */ + VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); +#else + VERIFY0(rflags); +#endif if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); @@ -665,7 +705,13 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg); +#if defined(__linux__) + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, + wo_vap, zfs_init_idmap); +#else + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, + wo_vap, NULL); +#endif zrele(tdzp); zrele(sdzp); @@ -673,6 +719,92 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) } static int +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); +} + +static int +zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, + NULL)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + +static int +zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_whiteout_t *lr = arg2; + int error; + /* For the whiteout file. */ + xvattr_t xva; + uint64_t objid; + uint64_t dnodesize; + + ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr)); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + objid = LR_FOID_GET_OBJ(lr->lr_wfoid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT; + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, + lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid); + + /* + * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which + * assigns the object's creation time, generation number, and dnode + * slot count. The generic zfs_rename() has no concept of these + * attributes, so we smuggle the values inside the vattr's otherwise + * unused va_ctime, va_nblocks, and va_fsid fields. 
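A minimal sketch of the attribute smuggling described above, assuming a simplified vattr layout: the replay code that follows packs the log record's values this way before calling zfs_rename(), and zfs_mknode() (not shown in this diff) is expected to read them back out of the same fields:

#include <stdint.h>

struct fake_vattr {
	uint64_t va_ctime[2];	/* carries lr_wcrtime */
	uint64_t va_nblocks;	/* carries lr_wgen */
	uint64_t va_fsid;	/* carries the dnode size in bytes */
};

static void
pack_whiteout_attrs(struct fake_vattr *va, const uint64_t crtime[2],
    uint64_t gen, uint64_t dnodesize)
{
	va->va_ctime[0] = crtime[0];
	va->va_ctime[1] = crtime[1];
	va->va_nblocks = gen;
	va->va_fsid = dnodesize;
}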
+ */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime); + xva.xva_vattr.va_nblocks = lr->lr_wgen; + xva.xva_vattr.va_fsid = dnodesize; + + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) + return (error); + + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; + return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, + RENAME_WHITEOUT, &xva.xva_vattr)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + +static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; @@ -682,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t eod, offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -745,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t end; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -789,16 +925,17 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) zfsvfs_t *zfsvfs = arg1; lr_truncate_t *lr = arg2; znode_t *zp; - flock64_t fl; + flock64_t fl = {0}; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - bzero(&fl, sizeof (fl)); fl.l_type = F_WRLCK; fl.l_whence = SEEK_SET; fl.l_start = lr->lr_offset; @@ -823,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) int error; void *start; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -859,7 +998,11 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); - error = zfs_setattr(zp, vap, 0, kcred); +#if defined(__linux__) + error = zfs_setattr(zp, vap, 0, kcred, zfs_init_idmap); +#else + error = zfs_setattr(zp, vap, 0, kcred, NULL); +#endif zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; @@ -869,15 +1012,102 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) } static int +zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_setsaxattr_t *lr = arg2; + znode_t *zp; + nvlist_t *nvl; + size_t sa_size; + char *name; + char *value; + size_t size; + int error = 0; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size); + + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, + SPA_FEATURE_ZILSAXATTR)); + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + goto out; + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + /* Get xattr name, value and size from log record */ + size = lr->lr_size; + name = (char *)(lr + 1); + if (size == 0) { + value = NULL; + error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + } else { + value = name + strlen(name) + 1; + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) { + error = 
SET_ERROR(EFBIG); + goto out; + } + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + goto out; + + if (sa_size > DXATTR_MAX_SA_SIZE) { + error = SET_ERROR(EFBIG); + goto out; + } + + error = nvlist_add_byte_array(nvl, name, (uchar_t *)value, + size); + } + + /* + * Update the SA for additions, modifications, and removals. On + * error drop the inconsistent cached version of the nvlist, it + * will be reconstructed from the ARC when next accessed. + */ + if (error == 0) + error = zfs_sa_set_xattr(zp, name, value, size); + + if (error) { + nvlist_free(nvl); + zp->z_xattr_cached = NULL; + } + +out: + rw_exit(&zp->z_xattr_lock); + zrele(zp); + return (error); +} + +static int zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_v0_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ - vsecattr_t vsa; + vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + + sizeof (ace_t) * lr->lr_aclcnt); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); @@ -886,7 +1116,6 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; @@ -920,10 +1149,13 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) zfsvfs_t *zfsvfs = arg1; lr_acl_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); - vsecattr_t vsa; + vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); @@ -937,7 +1169,6 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; @@ -964,10 +1195,42 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) return (error); } +static int +zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_clone_range_t *lr = arg2; + znode_t *zp; + int error; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * Clones can be logged out of order, so don't be surprised if + * the file is gone - just return success. 
+ */ + if (error == ENOENT) + error = 0; + return (error); + } + + error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length, + lr->lr_blksz, lr->lr_bps, lr->lr_nbps); + + zrele(zp); + return (error); +} + /* * Callback vectors for replaying records */ -zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { +zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_error, /* no such type */ zfs_replay_create, /* TX_CREATE */ zfs_replay_create, /* TX_MKDIR */ @@ -989,4 +1252,8 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ + zfs_replay_setsaxattr, /* TX_SETSAXATTR */ + zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */ + zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */ + zfs_replay_clone_range, /* TX_CLONE_RANGE */ }; diff --git a/sys/contrib/openzfs/module/zfs/zfs_rlock.c b/sys/contrib/openzfs/module/zfs/zfs_rlock.c index 06a5e031a7df..f42661df82e4 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_rlock.c +++ b/sys/contrib/openzfs/module/zfs/zfs_rlock.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/zfs_sa.c b/sys/contrib/openzfs/module/zfs/zfs_sa.c index 67be131da63b..fb2443b756f8 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_sa.c +++ b/sys/contrib/openzfs/module/zfs/zfs_sa.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -29,6 +29,7 @@ #include <sys/zfs_sa.h> #include <sys/dmu_objset.h> #include <sys/sa_impl.h> +#include <sys/zfeature.h> /* * ZPL attribute registration table. @@ -43,7 +44,7 @@ * this version of ZFS won't change or delete them. 
*/ -sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { +const sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, @@ -69,7 +70,10 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {NULL, 0, 0, 0} }; + #ifdef _KERNEL +static int zfs_zil_saxattr = 1; + int zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio) { @@ -103,8 +107,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx)); if (len) { - bcopy(link, (caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, len); + memcpy((caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, link, len); } } else { dmu_buf_t *dbp; @@ -116,7 +120,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); + memcpy(dbp->db_data, link, len); dmu_buf_rele(dbp, FTAG); } } @@ -219,13 +223,14 @@ zfs_sa_get_xattr(znode_t *zp) } int -zfs_sa_set_xattr(znode_t *zp) +zfs_sa_set_xattr(znode_t *zp, const char *name, const void *value, size_t vsize) { zfsvfs_t *zfsvfs = ZTOZSB(zp); + zilog_t *zilog; dmu_tx_t *tx; char *obj; size_t size; - int error; + int error, logsaxattr = 0; ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT(zp->z_xattr_cached); @@ -244,6 +249,17 @@ zfs_sa_set_xattr(znode_t *zp) if (error) goto out_free; + zilog = zfsvfs->z_log; + + /* + * Users enable ZIL logging of xattr=sa operations by enabling the + * SPA_FEATURE_ZILSAXATTR feature on the pool. Feature is activated + * during zil_process_commit_list/zil_create, if enabled. + */ + if (spa_feature_is_enabled(zfsvfs->z_os->os_spa, + SPA_FEATURE_ZILSAXATTR) && zfs_zil_saxattr) + logsaxattr = 1; + tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, size); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); @@ -256,6 +272,10 @@ zfs_sa_set_xattr(znode_t *zp) sa_bulk_attr_t bulk[2]; uint64_t ctime[2]; + if (logsaxattr) + zfs_log_setsaxattr(zilog, tx, TX_SETSAXATTR, zp, name, + value, vsize); + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs), NULL, obj, size); @@ -264,6 +284,8 @@ zfs_sa_set_xattr(znode_t *zp) VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); + if (logsaxattr && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); } out_free: vmem_free(obj, size); @@ -396,8 +418,9 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) /* if scanstamp then add scanstamp */ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); + memcpy(scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; @@ -433,6 +456,9 @@ zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) } } +ZFS_MODULE_PARAM(zfs, zfs_, zil_saxattr, INT, ZMOD_RW, + "Disable xattr=sa extended attribute logging in ZIL by settng 0."); + EXPORT_SYMBOL(zfs_attr_table); EXPORT_SYMBOL(zfs_sa_readlink); EXPORT_SYMBOL(zfs_sa_symlink); diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 7cbb70f499af..f3db953eab46 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -6,7 
+6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ /* Portions Copyright 2007 Jeremy Teo */ @@ -46,34 +47,53 @@ #include <sys/fs/zfs.h> #include <sys/dmu.h> #include <sys/dmu_objset.h> +#include <sys/dsl_crypt.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/dbuf.h> #include <sys/policy.h> +#include <sys/zfeature.h> #include <sys/zfs_vnops.h> #include <sys/zfs_quota.h> #include <sys/zfs_vfsops.h> #include <sys/zfs_znode.h> +/* + * Enable the experimental block cloning feature. If this setting is 0, then + * even if feature@block_cloning is enabled, attempts to clone blocks will act + * as though the feature is disabled. + */ +int zfs_bclone_enabled = 1; -static ulong_t zfs_fsync_sync_cnt = 4; +/* + * When set zfs_clone_range() waits for dirty data to be written to disk. + * This allows the clone operation to reliably succeed when a file is modified + * and then immediately cloned. For small files this may be slower than making + * a copy of the file and is therefore not the default. However, in certain + * scenarios this behavior may be desirable so a tunable is provided. + */ +static int zfs_bclone_wait_dirty = 0; + +/* + * Maximum bytes to read per chunk in zfs_read(). + */ +static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { + int error = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); + atomic_dec_32(&zp->z_sync_writes_cnt); + zfs_exit(zfsvfs, FTAG); } - tsd_set(zfs_fsyncer_key, NULL); - - return (0); + return (error); } @@ -102,10 +122,10 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) hole = B_FALSE; /* Flush any mmap()'d data to disk */ - if (zn_has_cached_data(zp)) - zn_flush_cached_data(zp, B_FALSE); + if (zn_has_cached_data(zp, 0, file_sz - 1)) + zn_flush_cached_data(zp, B_TRUE); - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER); + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); zfs_rangelock_exit(lr); @@ -144,37 +164,44 @@ zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = zfs_holey_common(zp, cmd, off); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ -/*ARGSUSED*/ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, 
B_FALSE, cr); +#if defined(__linux__) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, + zfs_init_idmap); +#else + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, + NULL); +#endif else - error = zfs_zaccess_rwx(zp, mode, flag, cr); +#if defined(__linux__) + error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); +#else + error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); +#endif - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } -static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ - /* * Read bytes from specified file into supplied buffer. * @@ -192,25 +219,25 @@ static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ * Side Effects: * inode - atime updated if byte count > 0 */ -/* ARGSUSED */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { + (void) cr; int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } @@ -218,7 +245,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -226,7 +253,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -275,7 +302,8 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = mappedread_sf(zp, nbytes, uio); else #endif - if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), @@ -311,10 +339,65 @@ out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } +static void +zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, + uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) +{ + zilog_t *zilog = zfsvfs->z_log; + const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); + + ASSERT(clear_setid_bits_txgp != NULL); + ASSERT(tx != NULL); + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(zp, cr, + ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { + uint64_t newmode; + + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + + mutex_exit(&zp->z_acl_lock); + + /* + * Make sure SUID/SGID bits will be removed when we replay the + * log. 
If the setid bits are keep coming back, don't log more + * than one TX_SETATTR per transaction group. + */ + if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { + vattr_t va = {0}; + + va.va_mask = ATTR_MODE; + va.va_nodeid = zp->z_id; + va.va_mode = newmode; + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, + ATTR_MODE, NULL); + *clear_setid_bits_txgp = dmu_tx_get_txg(tx); + } + } else { + mutex_exit(&zp->z_acl_lock); + } +} + /* * Write the bytes to a file. * @@ -333,13 +416,12 @@ out: * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ - -/* ARGSUSED */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { - int error = 0; + int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); + uint64_t clear_setid_bits_txg = 0; /* * Fasttrack empty write @@ -349,8 +431,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); sa_bulk_attr_t bulk[4]; int count = 0; @@ -367,7 +449,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } @@ -379,7 +461,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } @@ -388,19 +470,17 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } - const uint64_t max_blksz = zfsvfs->z_max_blksz; - /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. - * Skip this if uio contains loaned arc_buf. */ - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - ZFS_EXIT(zfsvfs); + ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); + if (zfs_uio_prefaultpages(pfbytes, uio)) { + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } @@ -433,9 +513,9 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } - if (zn_rlimit_fsize(zp, uio)) { + if (zn_rlimit_fsize_uio(zp, uio)) { zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } @@ -443,7 +523,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (woff >= limit) { zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } @@ -452,6 +532,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; + boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) || + (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS); const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); @@ -474,10 +556,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; } + uint64_t blksz; + if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { + if (zp->z_blksz > zfsvfs->z_max_blksz && + !ISP2(zp->z_blksz)) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
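The block-size choice in this hunk reduces to the following standalone sketch; next_pow2() and the 512-byte rounding stand in for highbit64() and P2ROUNDUP(..., SPA_MINBLOCKSIZE), and max_blksz plays the role of zfsvfs->z_max_blksz (the dataset recordsize):

#include <stdint.h>

static uint64_t
next_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p < x)
		p <<= 1;
	return (p);
}

/*
 * Only used when the range lock covers the whole file and the file still
 * fits in a single block (the lr_length == UINT64_MAX case above).
 */
static uint64_t
pick_write_blksz(uint64_t cur_blksz, uint64_t max_blksz, uint64_t end_size)
{
	uint64_t blksz;

	if (cur_blksz > max_blksz && (cur_blksz & (cur_blksz - 1)) != 0) {
		/* Already above recordsize: only grow to the next power of 2. */
		blksz = next_pow2(cur_blksz);
	} else {
		blksz = max_blksz;
	}
	/* No larger than needed, rounded up to the 512-byte minimum... */
	uint64_t rounded = (end_size + 511) & ~(uint64_t)511;
	if (blksz > rounded)
		blksz = rounded;
	/* ...and never smaller than the current block size. */
	return (blksz < cur_blksz ? cur_blksz : blksz);
}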
+ */ + blksz = 1 << highbit64(zp->z_blksz); + } else { + blksz = zfsvfs->z_max_blksz; + } + blksz = MIN(blksz, P2ROUNDUP(end_size, + SPA_MINBLOCKSIZE)); + blksz = MAX(blksz, zp->z_blksz); + } else { + blksz = zp->z_blksz; + } + arc_buf_t *abuf = NULL; - if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { + ssize_t nbytes = n; + if (n >= blksz && woff >= zp->z_size && + P2PHASE(woff, blksz) == 0 && + (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter @@ -485,18 +588,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ - size_t cbytes; - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); + blksz); ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = zfs_uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { + ASSERT(arc_buf_size(abuf) == blksz); + if ((error = zfs_uiocopy(abuf->b_data, blksz, + UIO_WRITE, uio, &nbytes))) { dmu_return_arcbuf(abuf); break; } - ASSERT3S(cbytes, ==, max_blksz); + ASSERT3S(nbytes, ==, blksz); + } else { + nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - + P2PHASE(woff, blksz)); + if (pfbytes < nbytes) { + if (zfs_uio_prefaultpages(nbytes, uio)) { + error = SET_ERROR(EFAULT); + break; + } + pfbytes = nbytes; + } } /* @@ -506,8 +617,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); @@ -519,37 +629,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } /* + * NB: We must call zfs_clear_setid_bits_if_necessary before + * committing the transaction! + */ + + /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); + zfs_grow_blocksize(zp, blksz, tx); zfs_rangelock_reduce(lr, woff, n); } - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
- */ - const ssize_t nbytes = - MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); @@ -559,6 +653,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, + cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); /* * Account for partial writes before @@ -567,30 +663,23 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ - if (tx_bytes != zfs_uio_resid(uio)) - n -= tx_bytes - zfs_uio_resid(uio); - if (zfs_uio_prefaultpages(MIN(n, max_blksz), - uio)) { - break; - } + n -= tx_bytes - zfs_uio_resid(uio); + pfbytes -= tx_bytes - zfs_uio_resid(uio); continue; } #endif - if (error != 0) { + /* + * On FreeBSD, EFAULT should be propagated back to the + * VFS, which will handle faulting and will retry. + */ + if (error != 0 && error != EFAULT) { + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, + cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { - /* Implied by abuf != NULL: */ - ASSERT3S(n, >=, max_blksz); - ASSERT0(P2PHASE(woff, max_blksz)); - /* - * We can simplify nbytes to MIN(n, max_blksz) since - * P2PHASE(woff, max_blksz) is 0, and knowing - * n >= max_blksz lets us simplify further: - */ - ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. @@ -601,6 +690,13 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { + /* + * XXX This might not be necessary if + * dmu_assign_arcbuf_by_dbuf is guaranteed + * to be atomic. + */ + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, + cr, &clear_setid_bits_txg, tx); dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; @@ -609,7 +705,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } - if (tx_bytes && zn_has_cached_data(zp) && + if (tx_bytes && + zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } @@ -626,30 +723,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; } - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the execute bits is set. - * - * It would be nice to do this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. 
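The setid test removed in this hunk (and factored into zfs_clear_setid_bits_if_necessary() earlier in the file) reduces to a check like the following; may_retain stands in for the secpolicy_vnode_setid_retain() policy call:

#include <stdint.h>
#include <sys/stat.h>

static int
should_clear_setid(uint64_t mode, int may_retain)
{
	/* S_IXUSR shifted right by 3 and 6 gives the group and other bits. */
	uint64_t any_exec = S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6);

	return ((mode & any_exec) != 0 &&
	    (mode & (S_ISUID | S_ISGID)) != 0 &&
	    !may_retain);
}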
- */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(zp, cr, - ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); + zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, + &clear_setid_bits_txg, tx); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); @@ -660,7 +735,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); - ASSERT(error == 0); + ASSERT(error == 0 || error == EFAULT); } /* * If we are replaying and eof is non zero then force @@ -670,23 +745,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + if (error1 != 0) + /* Avoid clobbering EFAULT. */ + error = error1; - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + /* + * NB: During replay, the TX_SETATTR record logged by + * zfs_clear_setid_bits_if_necessary must precede any of + * the TX_WRITE records logged here. + */ + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, NULL, NULL); + dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; - - if (n > 0) { - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = SET_ERROR(EFAULT); - break; - } - } + pfbytes -= nbytes; } zfs_znode_update_vfs(zp); @@ -699,23 +777,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - if (ioflag & (O_SYNC | O_DSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + if (commit) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } -/*ARGSUSED*/ int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { @@ -723,32 +799,31 @@ zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } -/*ARGSUSED*/ int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + zilog_t *zilog; + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + zilog = zfsvfs->z_log; error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -777,7 +852,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* @@ -804,7 +878,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, return (SET_ERROR(ENOENT)); } - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; @@ -827,6 +901,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated @@ -855,8 +930,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } #endif if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread(os, object, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -901,10 +976,10 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, } -/* ARGSUSED */ static void zfs_get_done(zgd_t *zgd, int error) { + (void) error; znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) @@ -921,6 +996,551 @@ zfs_get_done(zgd_t *zgd, int error) kmem_free(zgd, sizeof (zgd_t)); } +static int +zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) +{ + int error; + + /* Swap. Not sure if the order of zfs_enter()s is important. */ + if (zfsvfs1 > zfsvfs2) { + zfsvfs_t *tmpzfsvfs; + + tmpzfsvfs = zfsvfs2; + zfsvfs2 = zfsvfs1; + zfsvfs1 = tmpzfsvfs; + } + + error = zfs_enter(zfsvfs1, tag); + if (error != 0) + return (error); + if (zfsvfs1 != zfsvfs2) { + error = zfs_enter(zfsvfs2, tag); + if (error != 0) { + zfs_exit(zfsvfs1, tag); + return (error); + } + } + + return (0); +} + +static void +zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) +{ + + zfs_exit(zfsvfs1, tag); + if (zfsvfs1 != zfsvfs2) + zfs_exit(zfsvfs2, tag); +} + +/* + * We split each clone request into chunks that can fit into a single ZIL + * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning + * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives + * us room for storing 1022 block pointers. + * + * On success, the function returns the number of bytes copied in *lenp. + * Note that it doesn't return how many bytes are left to be copied. + * On errors caused by file system or BRT limitations `EINVAL` is + * returned. In most cases the user requested bad parameters: it may be + * possible to clone the file, but some parameters don't match the + * requirements.
+ */ +int +zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, + uint64_t *outoffp, uint64_t *lenp, cred_t *cr) +{ + zfsvfs_t *inzfsvfs, *outzfsvfs; + objset_t *inos, *outos; + zfs_locked_range_t *inlr, *outlr; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + zilog_t *zilog; + uint64_t inoff, outoff, len, done; + uint64_t outsize, size; + int error; + int count = 0; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + uint64_t uid, gid, projid; + blkptr_t *bps; + size_t maxblocks, nbps; + uint_t inblksz; + uint64_t clear_setid_bits_txg = 0; + uint64_t last_synced_txg = 0; + + inoff = *inoffp; + outoff = *outoffp; + len = *lenp; + done = 0; + + inzfsvfs = ZTOZSB(inzp); + outzfsvfs = ZTOZSB(outzp); + + /* + * We need to call zfs_enter() potentially on two different datasets, + * so we need a dedicated function for that. + */ + error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); + if (error != 0) + return (error); + + inos = inzfsvfs->z_os; + outos = outzfsvfs->z_os; + + /* + * Both source and destination have to belong to the same storage pool. + */ + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * outos and inos belongs to the same storage pool. + * see a few lines above, only one check. + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EOPNOTSUPP)); + } + + ASSERT(!outzfsvfs->z_replay); + + /* + * Block cloning from an unencrypted dataset into an encrypted + * dataset and vice versa is not supported. + */ + if (inos->os_encrypted != outos->os_encrypted) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * Cloning across encrypted datasets is possible only if they + * share the same master key. + */ + if (inos != outos && inos->os_encrypted && + !dmu_objset_crypto_key_equal(inos, outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + + error = zfs_verify_zp(inzp); + if (error == 0) + error = zfs_verify_zp(outzp); + if (error != 0) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (error); + } + + /* + * We don't copy source file's flags that's why we don't allow to clone + * files that are in quarantine. + */ + if (inzp->z_pflags & ZFS_AV_QUARANTINED) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EACCES)); + } + + if (inoff >= inzp->z_size) { + *lenp = 0; + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (0); + } + if (len > inzp->z_size - inoff) { + len = inzp->z_size - inoff; + } + if (len == 0) { + *lenp = 0; + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (0); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(outzfsvfs)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() + */ + if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EPERM)); + } + + /* + * No overlapping if we are cloning within the same file. 
+ */ + if (inzp == outzp) { + if (inoff < outoff + len && outoff < inoff + len) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EINVAL)); + } + } + + /* Flush any mmap()'d data to disk */ + if (zn_has_cached_data(inzp, inoff, inoff + len - 1)) + zn_flush_cached_data(inzp, B_TRUE); + + /* + * Maintain predictable lock order. + */ + if (inzp < outzp || (inzp == outzp && inoff < outoff)) { + inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, + RL_READER); + } + + inblksz = inzp->z_blksz; + + /* + * We cannot clone into a file with different block size if we can't + * grow it (block size is already bigger, has more than one block, or + * not locked for growth). There are other possible reasons for the + * grow to fail, but we cover what we can before opening the + * transaction and detect the rest after we try to do it. + */ + if (inblksz < outzp->z_blksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } + if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || + outlr->lr_length != UINT64_MAX)) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + /* + * Block size must be power-of-2 if destination offset != 0. + * There can be no multiple blocks of non-power-of-2 size. + */ + if (outoff != 0 && !ISP2(inblksz)) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + /* + * Offsets and len must be at block boundaries. + */ + if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { + error = SET_ERROR(EINVAL); + goto unlock; + } + /* + * Length must be a multiple of blksz, except for the end of the file. + */ + if ((len % inblksz) != 0 && + (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + /* + * If we are copying only one block and it is smaller than the + * recordsize property, do not allow the destination to grow beyond one + * block if it is not there yet. Otherwise the destination will get + * stuck with that block size forever, which can be as small as 512 + * bytes, no matter how big the destination grows later. + */ + if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz && + outzp->z_size <= inblksz && outoff + len > inblksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } + + error = zn_rlimit_fsize(outoff + len); + if (error != 0) { + goto unlock; + } + + if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) { + error = SET_ERROR(EFBIG); + goto unlock; + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL, + &outzp->z_size, 8); + + zilog = outzfsvfs->z_log; + maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + + uid = KUID_TO_SUID(ZTOUID(outzp)); + gid = KGID_TO_SGID(ZTOGID(outzp)); + projid = outzp->z_projid; + + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + + /* + * Clone the file in reasonably sized chunks. Each chunk is cloned + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting.
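The chunking arithmetic above can be sanity-checked in isolation. A minimal standalone sketch, assuming sizeof (blkptr_t) is 128 bytes, the 130816-byte record payload quoted in the function comment, and an example 128 KiB block size (all assumptions for illustration, not values read out of this code):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t payload = 130816;	/* assumed ZIL record payload */
	const uint64_t bpsize = 128;		/* assumed sizeof (blkptr_t) */
	const uint64_t recordsize = 131072;	/* example 128 KiB block size */

	uint64_t maxblocks = payload / bpsize;		/* 1022 block pointers */
	uint64_t chunk = maxblocks * recordsize;	/* bytes cloned per tx */

	printf("maxblocks = %llu, chunk = %llu bytes (~%llu MiB)\n",
	    (unsigned long long)maxblocks,
	    (unsigned long long)chunk,
	    (unsigned long long)(chunk >> 20));
	return (0);
}

With those numbers each transaction can reference up to 1022 blocks, i.e. roughly 127 MiB per chunk at a 128 KiB block size, which is why the loop below calls size = MIN(inblksz * maxblocks, len).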
+ */ + while (len > 0) { + size = MIN(inblksz * maxblocks, len); + + if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT, + uid) || + zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT, + gid) || + (projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT, + projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + nbps = maxblocks; + last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); + error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, + &nbps); + if (error != 0) { + /* + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fall back, or wait for the next TXG and check again. + */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool(inos), + last_synced_txg + 1); + continue; + } + + break; + } + + /* + * Start a transaction. + */ + tx = dmu_tx_create(outos); + dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, outzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + + /* + * Copy source znode's block size. This is done only if the + * whole znode is locked (see zfs_rangelock_cb()) and only + * on the first iteration since zfs_rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (outlr->lr_length == UINT64_MAX) { + zfs_grow_blocksize(outzp, inblksz, tx); + + /* + * Block growth may fail for many reasons we cannot + * predict here. If it happens, the cloning is doomed. + */ + if (inblksz != outzp->z_blksz) { + error = SET_ERROR(EINVAL); + dmu_tx_abort(tx); + break; + } + + /* + * Round range lock up to the block boundary, so we + * prevent appends until we are done. + */ + zfs_rangelock_reduce(outlr, outoff, + ((len - 1) / inblksz + 1) * inblksz); + } + + error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, + bps, nbps); + if (error != 0) { + dmu_tx_commit(tx); + break; + } + + if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { + update_pages(outzp, outoff, size, outos); + } + + zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, + &clear_setid_bits_txg, tx); + + zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((outsize = outzp->z_size) < outoff + size) { + (void) atomic_cas_64(&outzp->z_size, outsize, + outoff + size); + } + + error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); + + zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, + size, inblksz, bps, nbps); + + dmu_tx_commit(tx); + + if (error != 0) + break; + + inoff += size; + outoff += size; + len -= size; + done += size; + + if (issig()) { + error = SET_ERROR(EINTR); + break; + } + } + + vmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_znode_update_vfs(outzp); + +unlock: + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + + if (done > 0) { + /* + * If we have made at least partial progress, reset the error.
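Because the function may stop after partial progress (a shortened *lenp with the offsets already advanced), callers have to loop and be prepared to fall back to an ordinary copy. A hedged user-space illustration using copy_file_range(2), which is typically how block cloning is exercised from user space on FreeBSD and Linux; whether a given call actually clones or copies is decided by the kernel and by the checks above:

#define _FILE_OFFSET_BITS 64
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}

	int in = open(argv[1], O_RDONLY);
	int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (in < 0 || out < 0) {
		perror("open");
		return (1);
	}

	off_t inoff = 0, outoff = 0;
	off_t left = lseek(in, 0, SEEK_END);	/* bytes we would like cloned */

	while (left > 0) {
		ssize_t done = copy_file_range(in, &inoff, out, &outoff,
		    (size_t)left, 0);
		if (done < 0) {
			perror("copy_file_range");	/* caller would fall back */
			return (1);
		}
		if (done == 0)
			break;			/* source EOF reached */
		left -= done;			/* offsets already advanced */
	}
	return (0);
}

The loop mirrors the in-kernel contract sketched above: consumed bytes advance the offsets, and a short or failed call is the signal to retry or to copy the remainder by other means.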
+ */ + error = 0; + + ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); + + if (outos->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog, outzp->z_id); + } + + *inoffp += done; + *outoffp += done; + *lenp = done; + } else { + /* + * If we made no progress, there must be a good reason. + * EOF is handled explicitly above, before the loop. + */ + ASSERT3S(error, !=, 0); + } + + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + + return (error); +} + +/* + * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), + * but we cannot do that, because when replaying we don't have source znode + * available. This is why we need a dedicated replay function. + */ +int +zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, + const blkptr_t *bps, size_t nbps) +{ + zfsvfs_t *zfsvfs; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + int error; + int count = 0; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + + ASSERT3U(off, <, MAXOFFSET_T); + ASSERT3U(len, >, 0); + ASSERT3U(nbps, >, 0); + + zfsvfs = ZTOZSB(zp); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)); + + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + + ASSERT(zfsvfs->z_replay); + ASSERT(!zfs_is_readonly(zfsvfs)); + + if ((off % blksz) != 0) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(EINVAL)); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + + /* + * Start a transaction. + */ + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + zfs_exit(zfsvfs, FTAG); + return (error); + } + + if (zp->z_blksz < blksz) + zfs_grow_blocksize(zp, blksz, tx); + + dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + if (zp->z_size < off + len) + zp->z_size = off + len; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + /* + * zil_replaying() not only check if we are replaying ZIL, but also + * updates the ZIL header to record replay progress. 
+ */ + VERIFY(zil_replaying(zfsvfs->z_log, tx)); + + dmu_tx_commit(tx); + + zfs_znode_update_vfs(zp); + + zfs_exit(zfsvfs, FTAG); + + return (error); +} + EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); @@ -928,6 +1548,14 @@ EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); +EXPORT_SYMBOL(zfs_clone_range); +EXPORT_SYMBOL(zfs_clone_range_replay); -ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, + "Enable block cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, + "Wait for dirty blocks when cloning"); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 640e805d093a..34be54b337fd 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -43,6 +43,8 @@ #include <sys/metaslab.h> #include <sys/trace_zfs.h> #include <sys/abd.h> +#include <sys/brt.h> +#include <sys/wmsum.h> /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system @@ -89,12 +91,12 @@ * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ -int zfs_commit_timeout_pct = 5; +static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. */ -zil_stats_t zil_stats = { +static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, @@ -106,11 +108,16 @@ zil_stats_t zil_stats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; -static kstat_t *zil_ksp; +static zil_sums_t zil_sums_global; +static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. @@ -118,25 +125,25 @@ static kstat_t *zil_ksp; int zil_replay_disable = 0; /* - * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to - * the disk(s) by the ZIL after an LWB write has completed. Setting this - * will cause ZIL corruption on power loss if a volatile out-of-order - * write cache is enabled. + * Disable the flush commands that are normally sent to the disk(s) by the ZIL + * after an LWB write has completed. Setting this will cause ZIL corruption on + * power loss if a volatile out-of-order write cache is enabled. */ -int zil_nocacheflush = 0; +static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. 
* Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ -unsigned long zil_slog_bulk = 768 * 1024; +static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; -#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ - sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) +static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static itx_t *zil_itx_clone(itx_t *oitx); +static uint64_t zil_max_waste_space(zilog_t *zilog); static int zil_bp_compare(const void *x1, const void *x2) @@ -213,16 +220,30 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } +static int +zil_kstats_global_update(kstat_t *ksp, int rw) +{ + zil_kstat_values_t *zs = ksp->ks_data; + ASSERT3P(&zil_stats, ==, zs); + + if (rw == KSTAT_WRITE) { + return (SET_ERROR(EACCES)); + } + + zil_kstat_values_update(zs, &zil_sums_global); + + return (0); +} + /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, - blkptr_t *nbp, void *dst, char **end) + blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -239,7 +260,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, - &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; @@ -254,39 +275,35 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, */ cksum.zc_word[ZIL_ZC_SEQ]++; + uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); - uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || + zilc->zc_nused < sizeof (*zilc) || + zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, len); - *end = (char *)dst + len; + *begin = lr; + *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; - uint64_t size = BP_GET_LSIZE(bp); + char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; - if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(zilc->zc_nused, <=, - SPA_OLD_MAXBLOCKSIZE); - bcopy(lr, dst, zilc->zc_nused); - *end = (char *)dst + zilc->zc_nused; + *begin = lr; + *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } - - arc_buf_destroy(abuf, &abuf); } return (error); @@ -298,7 +315,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { - enum zio_flag zio_flags = 
ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; @@ -307,7 +324,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (BP_IS_HOLE(bp)) { if (wbuf != NULL) - bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); + memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } @@ -322,6 +339,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -330,13 +348,96 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) - bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } +void +zil_sums_init(zil_sums_t *zs) +{ + wmsum_init(&zs->zil_commit_count, 0); + wmsum_init(&zs->zil_commit_writer_count, 0); + wmsum_init(&zs->zil_itx_count, 0); + wmsum_init(&zs->zil_itx_indirect_count, 0); + wmsum_init(&zs->zil_itx_indirect_bytes, 0); + wmsum_init(&zs->zil_itx_copied_count, 0); + wmsum_init(&zs->zil_itx_copied_bytes, 0); + wmsum_init(&zs->zil_itx_needcopy_count, 0); + wmsum_init(&zs->zil_itx_needcopy_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); +} + +void +zil_sums_fini(zil_sums_t *zs) +{ + wmsum_fini(&zs->zil_commit_count); + wmsum_fini(&zs->zil_commit_writer_count); + wmsum_fini(&zs->zil_itx_count); + wmsum_fini(&zs->zil_itx_indirect_count); + wmsum_fini(&zs->zil_itx_indirect_bytes); + wmsum_fini(&zs->zil_itx_copied_count); + wmsum_fini(&zs->zil_itx_copied_bytes); + wmsum_fini(&zs->zil_itx_needcopy_count); + wmsum_fini(&zs->zil_itx_needcopy_bytes); + wmsum_fini(&zs->zil_itx_metaslab_normal_count); + wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); + wmsum_fini(&zs->zil_itx_metaslab_normal_write); + wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); + wmsum_fini(&zs->zil_itx_metaslab_slog_count); + wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); + wmsum_fini(&zs->zil_itx_metaslab_slog_write); + wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); +} + +void +zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) +{ + zs->zil_commit_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_count); + zs->zil_commit_writer_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_writer_count); + zs->zil_itx_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_count); + zs->zil_itx_indirect_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_indirect_count); + zs->zil_itx_indirect_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_indirect_bytes); + zs->zil_itx_copied_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_copied_count); + zs->zil_itx_copied_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_copied_bytes); + zs->zil_itx_needcopy_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_needcopy_count); + zs->zil_itx_needcopy_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_needcopy_bytes); + zs->zil_itx_metaslab_normal_count.value.ui64 = + 
wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); + zs->zil_itx_metaslab_normal_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); + zs->zil_itx_metaslab_normal_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); + zs->zil_itx_metaslab_normal_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); + zs->zil_itx_metaslab_slog_count.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); + zs->zil_itx_metaslab_slog_bytes.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); + zs->zil_itx_metaslab_slog_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); + zs->zil_itx_metaslab_slog_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); +} + /* * Parse the intent log, and call parse_func for each valid record within. */ @@ -353,12 +454,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; - blkptr_t blk, next_blk; - char *lrbuf, *lrp; + blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; - bzero(&next_blk, sizeof (blkptr_t)); - /* * Old logs didn't record the maximum zh_claim_lr_seq. */ @@ -374,13 +472,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; - char *end = NULL; + char *lrp, *end; + arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; @@ -396,24 +494,42 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, - lrbuf, &end); - if (error != 0) + &lrp, &end, &abuf); + if (error != 0) { + if (abuf) + arc_buf_destroy(abuf, &abuf); + if (claimed) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + + dmu_objset_name(zilog->zl_os, name); + + cmn_err(CE_WARN, "ZFS read log block error %d, " + "dataset %s, seq 0x%llx\n", error, name, + (u_longlong_t)blk_seq); + } break; + } - for (lrp = lrbuf; lrp < end; lrp += reclen) { + for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); - if (lr->lrc_seq > claim_lr_seq) + ASSERT3U(reclen, <=, end - lrp); + if (lr->lrc_seq > claim_lr_seq) { + arc_buf_destroy(abuf, &abuf); goto done; + } error = parse_lr_func(zilog, lr, arg, txg); - if (error != 0) + if (error != 0) { + arc_buf_destroy(abuf, &abuf); goto done; + } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } + arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; @@ -422,21 +538,16 @@ done: zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; - ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || - (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) || - (decrypt && error == EIO)); - zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } -/* ARGSUSED */ static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { + (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* @@ -445,7 +556,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. 
*/ - if (bp->blk_birth >= first_txg) + if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) @@ -455,11 +566,11 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, return (0); } -/* ARGSUSED */ static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { + (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } @@ -471,7 +582,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ - if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); @@ -481,14 +592,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, } static int -zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, - uint64_t first_txg) +zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; - if (lrc->lrc_txtype != TX_WRITE) - return (0); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If the block is not readable, don't claim it. This can happen @@ -498,7 +607,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ - if (lr->lr_blkptr.blk_birth >= first_txg) { + if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); @@ -507,35 +616,156 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } -/* ARGSUSED */ +static int +zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) +{ + const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; + const blkptr_t *bp; + spa_t *spa = zilog->zl_spa; + uint_t ii; + + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (tx == NULL) { + return (0); + } + + /* + * XXX: Do we need to byteswap lr? + */ + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + + /* + * When data is embedded into the BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as it + * contains the data. + */ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + continue; + + /* + * We can not handle block pointers from the future, since they + * are not yet allocated. It should not normally happen, but + * just in case lets be safe and just stop here now instead of + * corrupting the pool. + */ + if (BP_GET_BIRTH(bp) >= first_txg) + return (SET_ERROR(ENOENT)); + + /* + * Assert the block is really allocated before we reference it. 
+ */ + metaslab_check_free(spa, bp); + } + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) + brt_pending_add(spa, bp, tx); + } + + return (0); +} + +static int +zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) +{ + + switch (lrc->lrc_txtype) { + case TX_WRITE: + return (zil_claim_write(zilog, lrc, tx, first_txg)); + case TX_CLONE_RANGE: + return (zil_claim_clone_range(zilog, lrc, tx, first_txg)); + default: + return (0); + } +} + static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { + (void) claim_txg; + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int -zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, - uint64_t claim_txg) +zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + /* * If we previously claimed it, we need to free it. */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && - bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && - !BP_IS_HOLE(bp)) + if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg && + zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + } return (0); } static int +zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) +{ + const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; + const blkptr_t *bp; + spa_t *spa; + uint_t ii; + + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (tx == NULL) { + return (0); + } + + spa = zilog->zl_spa; + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + + if (!BP_IS_HOLE(bp)) { + zio_free(spa, dmu_tx_get_txg(tx), bp); + } + } + + return (0); +} + +static int +zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t claim_txg) +{ + + if (claim_txg == 0) { + return (0); + } + + switch (lrc->lrc_txtype) { + case TX_WRITE: + return (zil_free_write(zilog, lrc, tx, claim_txg)); + case TX_CLONE_RANGE: + return (zil_free_clone_range(zilog, lrc, tx)); + default: + return (0); + } +} + +static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; @@ -544,41 +774,54 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) return (TREE_CMP(v1, v2)); } +/* + * Allocate a new lwb. We may already have a block pointer for it, in which + * case we get size and version from there. Or we may not yet, in which case + * we choose them here and later make the block allocation match. 
+ */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, - boolean_t fastwrite) +zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, + uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; - lwb->lwb_blk = *bp; - lwb->lwb_fastwrite = fastwrite; + if (bp) { + lwb->lwb_blk = *bp; + lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + sz = BP_GET_LSIZE(bp); + } else { + BP_ZERO(&lwb->lwb_blk); + lwb->lwb_slim = (spa_version(zilog->zl_spa) >= + SPA_VERSION_SLIM_ZIL); + } lwb->lwb_slog = slog; - lwb->lwb_state = LWB_STATE_CLOSED; - lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; + lwb->lwb_error = 0; + if (lwb->lwb_slim) { + lwb->lwb_nmax = sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } + lwb->lwb_sz = sz; + lwb->lwb_state = state; + lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; - lwb->lwb_tx = NULL; lwb->lwb_issued_timestamp = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } + lwb->lwb_issued_txg = 0; + lwb->lwb_alloc_txg = txg; + lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); + if (state != LWB_STATE_NEW) + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); - return (lwb); } @@ -586,15 +829,17 @@ static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_FLUSH_DONE); + ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); - ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || - lwb->lwb_state == LWB_STATE_FLUSH_DONE); + VERIFY(list_is_empty(&lwb->lwb_itxs)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer @@ -663,6 +908,36 @@ zilog_is_dirty(zilog_t *zilog) } /* + * Its called in zil_commit context (zil_process_commit_list()/zil_create()). + * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. + * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every + * zil_commit. 
+ */ +static void +zil_commit_activate_saxattr_feature(zilog_t *zilog) +{ + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + uint64_t txg = 0; + dmu_tx_t *tx = NULL; + + if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && + dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && + !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { + tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_dataset_dirty(ds, tx); + txg = dmu_tx_get_txg(tx); + + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } +} + +/* * Create an on-disk intent log. */ static lwb_t * @@ -674,8 +949,9 @@ zil_create(zilog_t *zilog) dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; - boolean_t fastwrite = FALSE; boolean_t slog = FALSE; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + /* * Wait for any previous destroy to complete. @@ -705,8 +981,6 @@ zil_create(zilog_t *zilog) error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); - fastwrite = TRUE; - if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -715,7 +989,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. */ if (error == 0) - lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); + lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction @@ -723,11 +997,35 @@ zil_create(zilog_t *zilog) * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { + /* + * If "zilsaxattr" feature is enabled on zpool, then activate + * it now when we're creating the ZIL chain. We can't wait with + * this until we write the first xattr log record because we + * need to wait for the feature activation to sync out. + */ + if (spa_feature_is_enabled(zilog->zl_spa, + SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != + DMU_OST_ZVOL) { + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); + } else { + /* + * This branch covers the case where we enable the feature on a + * zpool that has existing ZIL headers. + */ + zil_commit_activate_saxattr_feature(zilog); } + IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && + dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, + dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); - ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); + ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); @@ -741,8 +1039,9 @@ zil_create(zilog_t *zilog) * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. + * Return B_TRUE if there were any entries to replay. 
*/ -void +boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; @@ -758,7 +1057,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) - return; + return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); @@ -774,15 +1073,11 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, - &lwb->lwb_blk); - - list_remove(&zilog->zl_lwb_list, lwb); + while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); - zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { @@ -791,6 +1086,8 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); + + return (B_TRUE); } void @@ -911,10 +1208,10 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ -/* ARGSUSED */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { + (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; @@ -1008,21 +1305,21 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. + * zl_issuer_lock while the lwb is open and zl_lock otherwise. + * zl_issuer_lock also protects leaving the open state. + * zcw_lwb setting is protected by zl_issuer_lock and state != + * flush_done, which transition is protected by zl_lock. 
*/ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + IMPLY(lwb->lwb_state != LWB_STATE_OPENED, + MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); - ASSERT3P(lwb, !=, NULL); - ASSERT(lwb->lwb_state == LWB_STATE_OPENED || - lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE); - list_insert_tail(&lwb->lwb_waiters, zcw); + ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; - mutex_exit(&zcw->zcw_lock); } /* @@ -1033,11 +1330,9 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { - mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); list_insert_tail(nolwb, zcw); - mutex_exit(&zcw->zcw_lock); + ASSERT3P(zcw->zcw_lwb, ==, NULL); } void @@ -1049,6 +1344,9 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) int ndvas = BP_GET_NDVAS(bp); int i; + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + if (zil_nocacheflush) return; @@ -1107,48 +1405,33 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) } /* - * This function is a called after all vdevs associated with a given lwb - * write have completed their DKIOCFLUSHWRITECACHE command; or as soon - * as the lwb write completes, if "zil_nocacheflush" is set. Further, - * all "previous" lwb's will have completed before this function is - * called; i.e. this function is called for all previous lwbs before - * it's called for "this" lwb (enforced via zio the dependencies - * configured in zil_lwb_set_zio_dependency()). + * This function is a called after all vdevs associated with a given lwb write + * have completed their flush command; or as soon as the lwb write completes, + * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have + * completed before this function is called; i.e. this function is called for + * all previous lwbs before it's called for "this" lwb (enforced via zio the + * dependencies configured in zil_lwb_set_zio_dependency()). * - * The intention is for this function to be called as soon as the - * contents of an lwb are considered "stable" on disk, and will survive - * any sudden loss of power. At this point, any threads waiting for the - * lwb to reach this state are signalled, and the "waiter" structures - * are marked "done". + * The intention is for this function to be called as soon as the contents of + * an lwb are considered "stable" on disk, and will survive any sudden loss of + * power. At this point, any threads waiting for the lwb to reach this state + * are signalled, and the "waiter" structures are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; - dmu_tx_t *tx = lwb->lwb_tx; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); - /* - * Ensure the lwb buffer pointer is cleared before releasing the - * txg. 
If we have had an allocation failure and the txg is - * waiting to sync then we want zil_sync() to remove the lwb so - * that it's not picked up as the next new one in - * zil_process_commit_list(). zil_sync() will only remove the - * lwb if lwb_buf is null. - */ - lwb->lwb_buf = NULL; - lwb->lwb_tx = NULL; - - ASSERT3U(lwb->lwb_issued_timestamp, >, 0); - zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; + zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; @@ -1165,17 +1448,12 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } - while ((itx = list_head(&lwb->lwb_itxs)) != NULL) { - list_remove(&lwb->lwb_itxs, itx); + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); - } - while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); - ASSERT(list_link_active(&zcw->zcw_node)); - list_remove(&lwb->lwb_waiters, zcw); - ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* @@ -1202,28 +1480,65 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } + uint64_t txg = lwb->lwb_issued_txg; + + /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); - /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. - */ - dmu_tx_commit(tx); + mutex_enter(&zilog->zl_lwb_io_lock); + ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); + zilog->zl_lwb_inflight[txg & TXG_MASK]--; + if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) + cv_broadcast(&zilog->zl_lwb_io_cv); + mutex_exit(&zilog->zl_lwb_io_lock); } /* - * This is called when an lwb's write zio completes. The callback's - * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs - * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved - * in writing out this specific lwb's data, and in the case that cache - * flushes have been deferred, vdevs involved in writing the data for - * previous lwbs. The writes corresponding to all the vdevs in the - * lwb_vdev_tree will have completed by the time this is called, due to - * the zio dependencies configured in zil_lwb_set_zio_dependency(), - * which takes deferred flushes into account. The lwb will be "done" - * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio - * completion callback for the lwb's root zio. + * Wait for the completion of all issued write/flush of that txg provided. + * It guarantees zil_lwb_flush_vdevs_done() is called and returned. 
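The zl_lwb_inflight/zl_lwb_io_cv scheme used by zil_lwb_flush_vdevs_done() and zil_lwb_flush_wait_all() is a plain counter-drain pattern: completions decrement the per-txg counter under a lock and broadcast, while the waiter sleeps until the count reaches zero. A standalone pthreads sketch of the same idea (the names here are illustrative, not OpenZFS symbols):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static int inflight = 4;			/* issued but not yet flushed */

static void *
io_done(void *arg)
{
	(void) arg;
	usleep(1000);				/* pretend the flush took time */
	pthread_mutex_lock(&io_lock);
	if (--inflight == 0)
		pthread_cond_broadcast(&io_cv);	/* cv_broadcast() analogue */
	pthread_mutex_unlock(&io_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, io_done, NULL);

	pthread_mutex_lock(&io_lock);
	while (inflight > 0)			/* cv_wait() analogue */
		pthread_cond_wait(&io_cv, &io_lock);
	pthread_mutex_unlock(&io_lock);
	printf("all in-flight lwb writes/flushes drained\n");

	for (int i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return (0);
}

Build with cc -pthread; the kernel version additionally indexes the counter by txg & TXG_MASK so that each syncing txg drains independently.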
+ */ +static void +zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) +{ + ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); + + mutex_enter(&zilog->zl_lwb_io_lock); + while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) + cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); + mutex_exit(&zilog->zl_lwb_io_lock); + +#ifdef ZFS_DEBUG + mutex_enter(&zilog->zl_lock); + mutex_enter(&zilog->zl_lwb_io_lock); + lwb_t *lwb = list_head(&zilog->zl_lwb_list); + while (lwb != NULL) { + if (lwb->lwb_issued_txg <= txg) { + ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); + ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); + IMPLY(lwb->lwb_issued_txg > 0, + lwb->lwb_state == LWB_STATE_FLUSH_DONE); + } + IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || + lwb->lwb_state == LWB_STATE_FLUSH_DONE, + lwb->lwb_buf == NULL); + lwb = list_next(&zilog->zl_lwb_list, lwb); + } + mutex_exit(&zilog->zl_lwb_io_lock); + mutex_exit(&zilog->zl_lock); +#endif +} + +/* + * This is called when an lwb's write zio completes. The callback's purpose is + * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The + * tree will contain the vdevs involved in writing out this specific lwb's + * data, and in the case that cache flushes have been deferred, vdevs involved + * in writing the data for previous lwbs. The writes corresponding to all the + * vdevs in the lwb_vdev_tree will have completed by the time this is called, + * due to the zio dependencies configured in zil_lwb_set_zio_dependency(), + * which takes deferred flushes into account. The lwb will be "done" once + * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion + * callback for the lwb's root zio. */ static void zil_lwb_write_done(zio_t *zio) @@ -1238,22 +1553,25 @@ zil_lwb_write_done(zio_t *zio) ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); - ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); - ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); - ASSERT(!BP_IS_GANG(zio->io_bp)); - ASSERT(!BP_IS_HOLE(zio->io_bp)); - ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_free(zio->io_abd); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; + lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; - lwb->lwb_fastwrite = FALSE; + + /* + * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not + * called for it yet, and when it will be, it won't be able to make + * its write ZIO a parent this ZIO. In such case we can not defer + * our flushes or below may be a race between the done callbacks. + */ nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) @@ -1279,21 +1597,20 @@ zil_lwb_write_done(zio_t *zio) } /* - * If this lwb does not have any threads waiting for it to - * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE - * command to the vdevs written to by "this" lwb, and instead - * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE - * command for those vdevs. Thus, we merge the vdev tree of - * "this" lwb with the vdev tree of the "next" lwb in the list, - * and assume the "next" lwb will handle flushing the vdevs (or - * deferring the flush(s) again). 
+ * If this lwb does not have any threads waiting for it to complete, we + * want to defer issuing the flush command to the vdevs written to by + * "this" lwb, and instead rely on the "next" lwb to handle the flush + * command for those vdevs. Thus, we merge the vdev tree of "this" lwb + * with the vdev tree of the "next" lwb in the list, and assume the + * "next" lwb will handle flushing the vdevs (or deferring the flush(s) + * again). * - * This is a useful performance optimization, especially for - * workloads with lots of async write activity and few sync - * write and/or fsync activity, as it has the potential to - * coalesce multiple flush commands to a vdev into one. + * This is a useful performance optimization, especially for workloads + * with lots of async write activity and few sync write and/or fsync + * activity, as it has the potential to coalesce multiple flush + * commands to a vdev into one. */ - if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) { + if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; @@ -1316,338 +1633,487 @@ zil_lwb_write_done(zio_t *zio) } } +/* + * Build the zio dependency chain, which is used to preserve the ordering of + * lwb completions that is required by the semantics of the ZIL. Each new lwb + * zio becomes a parent of the previous lwb zio, such that the new lwb's zio + * cannot complete until the previous lwb's zio completes. + * + * This is required by the semantics of zil_commit(): the commit waiters + * attached to the lwbs will be woken in the lwb zio's completion callback, + * so this zio dependency graph ensures the waiters are woken in the correct + * order (the same order the lwbs were created). + */ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { - lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; - - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zilog->zl_lock)); + lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); + if (prev_lwb == NULL || + prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) + return; + /* - * The zilog's "zl_last_lwb_opened" field is used to build the - * lwb/zio dependency chain, which is used to preserve the - * ordering of lwb completions that is required by the semantics - * of the ZIL. Each new lwb zio becomes a parent of the - * "previous" lwb zio, such that the new lwb's zio cannot - * complete until the "previous" lwb's zio completes. + * If the previous lwb's write hasn't already completed, we also want + * to order the completion of the lwb write zios (above, we only order + * the completion of the lwb root zios). This is required because of + * how we can defer the flush commands for each lwb. * - * This is required by the semantics of zil_commit(); the commit - * waiters attached to the lwbs will be woken in the lwb zio's - * completion callback, so this zio dependency graph ensures the - * waiters are woken in the correct order (the same order the - * lwbs were created). 
- */ - if (last_lwb_opened != NULL && - last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED || - last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); - - ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); - zio_add_child(lwb->lwb_root_zio, - last_lwb_opened->lwb_root_zio); - - /* - * If the previous lwb's write hasn't already completed, - * we also want to order the completion of the lwb write - * zios (above, we only order the completion of the lwb - * root zios). This is required because of how we can - * defer the DKIOCFLUSHWRITECACHE commands for each lwb. - * - * When the DKIOCFLUSHWRITECACHE commands are deferred, - * the previous lwb will rely on this lwb to flush the - * vdevs written to by that previous lwb. Thus, we need - * to ensure this lwb doesn't issue the flush until - * after the previous lwb's write completes. We ensure - * this ordering by setting the zio parent/child - * relationship here. - * - * Without this relationship on the lwb's write zio, - * it's possible for this lwb's write to complete prior - * to the previous lwb's write completing; and thus, the - * vdevs for the previous lwb would be flushed prior to - * that lwb's data being written to those vdevs (the - * vdevs are flushed in the lwb write zio's completion - * handler, zil_lwb_write_done()). - */ - if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { - ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || - last_lwb_opened->lwb_state == LWB_STATE_ISSUED); - - ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, - last_lwb_opened->lwb_write_zio); - } + * When the flush commands are deferred, the previous lwb will rely on + * this lwb to flush the vdevs written to by that previous lwb. Thus, + * we need to ensure this lwb doesn't issue the flush until after the + * previous lwb's write completes. We ensure this ordering by setting + * the zio parent/child relationship here. + * + * Without this relationship on the lwb's write zio, it's possible for + * this lwb's write to complete prior to the previous lwb's write + * completing; and thus, the vdevs for the previous lwb would be + * flushed prior to that lwb's data being written to those vdevs (the + * vdevs are flushed in the lwb write zio's completion handler, + * zil_lwb_write_done()). + */ + if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { + ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); + zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); + } else { + ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } + + ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to - * accept new itxs being committed to it. To do this, the lwb's zio - * structures are created, and linked to the lwb. This function is - * idempotent; if the passed in lwb has already been opened, this - * function is essentially a no-op. + * accept new itxs being committed to it. This function is idempotent; if + * the passed in lwb has already been opened, it is essentially a no-op. 
*/ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { - zbookmark_phys_t zb; - zio_priority_t prio; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb, !=, NULL); - EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); - EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); - SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, - lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + if (lwb->lwb_state != LWB_STATE_NEW) { + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + return; + } - /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); + lwb->lwb_state = LWB_STATE_OPENED; + zilog->zl_last_lwb_opened = lwb; + mutex_exit(&zilog->zl_lock); +} - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } +/* + * Maximum block size used by the ZIL. This is picked up when the ZIL is + * initialized. Otherwise this should not be used directly; see + * zl_max_block_size instead. + */ +static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; +/* + * Plan splitting of the provided burst size between several blocks. + */ +static uint_t +zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) +{ + uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); + if (size <= md) { + /* + * Small bursts are written as-is in one block. + */ + *minsize = size; + return (size); + } else if (size > 8 * md) { + /* + * Big bursts use maximum blocks. The first block size + * is hard to predict, but it does not really matter. + */ + *minsize = 0; + return (md); + } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); + /* + * Medium bursts try to divide evenly to better utilize several SLOG + * VDEVs. The first block size we predict assuming the worst case of + * maxing out others. Fall back to using maximum blocks if due to + * large records or wasted space we can not predict anything better. + */ + uint_t s = size; + uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); + uint_t chunk = DIV_ROUND_UP(s, n); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); + if (chunk <= md - waste) { + *minsize = MAX(s - (md - waste) * (n - 1), waste); + return (chunk); + } else { + *minsize = 0; + return (md); + } +} - lwb->lwb_state = LWB_STATE_OPENED; +/* + * Try to predict next block size based on previous history. Make prediction + * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is + * less then 50%, extra writes may cost more, but we don't want single spike + * to badly affect our predictions. + */ +static uint_t +zil_lwb_predict(zilog_t *zilog) +{ + uint_t m, o; - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; + /* If we are in the middle of a burst, take it into account also. 
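The three cases in zil_lwb_plan() are easier to follow with concrete numbers. The standalone sketch below restates the same arithmetic; MAX_BLK, CHAIN_HDR and WRITE_HDR are placeholder values standing in for zl_max_block_size, sizeof (zil_chain_t) and sizeof (lr_write_t) rather than the real sizes, and zil_max_waste_space() is inlined as the 1/16th share defined later in this diff.

#include <stdio.h>

#define DIV_ROUND_UP(a, b)      (((a) + (b) - 1) / (b))
#define MAX_BLK         (128u * 1024u)  /* assumed zl_max_block_size */
#define CHAIN_HDR       128u            /* placeholder: sizeof (zil_chain_t) */
#define WRITE_HDR       192u            /* placeholder: sizeof (lr_write_t) */

static unsigned
plan(unsigned size, unsigned cur_max, unsigned *minsize)
{
        unsigned md = MAX_BLK - CHAIN_HDR;

        if (size <= md) {               /* small burst: written as-is */
                *minsize = size;
                return (size);
        }
        if (size > 8 * md) {            /* big burst: just use maximum blocks */
                *minsize = 0;
                return (md);
        }

        /* Medium burst: split evenly over the fewest blocks that can hold it. */
        unsigned n = DIV_ROUND_UP(size, md - WRITE_HDR);
        unsigned chunk = DIV_ROUND_UP(size, n);
        unsigned waste = (MAX_BLK - CHAIN_HDR - WRITE_HDR) / 16;
        if (waste < cur_max)
                waste = cur_max;        /* allow slack for the largest record */
        if (chunk <= md - waste) {
                unsigned first = size - (md - waste) * (n - 1);
                *minsize = (first > waste) ? first : waste;
                return (chunk);
        }
        *minsize = 0;
        return (md);
}

int
main(void)
{
        unsigned m;
        unsigned chunk = plan(300u * 1024u, 16u * 1024u, &m);

        printf("chunk=%u minsize=%u\n", chunk, m);      /* chunk=102400 minsize=78080 */
        return (0);
}

With a 128 KiB block limit, a roughly 300 KiB burst whose largest record so far is 16 KiB is planned as three equal 100 KiB chunks, and the caller is told the first block should hold at least 78080 bytes.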
*/ + if (zilog->zl_cur_size > 0) { + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); + } else { + o = UINT_MAX; + m = 0; } - mutex_exit(&zilog->zl_lock); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* Find minimum optimal size. We don't need to go below that. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + + /* Find two biggest minimal first block sizes above the optimal. */ + uint_t m1 = MAX(m, o), m2 = o; + for (int i = 0; i < ZIL_BURSTS; i++) { + m = zilog->zl_prev_min[i]; + if (m >= m1) { + m2 = m1; + m1 = m; + } else if (m > m2) { + m2 = m; + } + } + + /* + * If second minimum size gives 50% saving -- use it. It may cost us + * one additional write later, but the space saving is just too big. + */ + return ((m1 < m2 * 2) ? m1 : m2); } /* - * Define a limited set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. + * Close the log block for being issued and allocate the next one. + * Has to be called under zl_issuer_lock to chain more lwbs. */ -struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { 131072, 131072 }, /* < 128KB writes */ - { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; +static lwb_t * +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) +{ + uint64_t blksz, plan, plan2; -/* - * Maximum block size used by the ZIL. This is picked up when the ZIL is - * initialized. Otherwise this should not be used directly; see - * zl_max_block_size instead. - */ -int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_state = LWB_STATE_CLOSED; + + /* + * If there was an allocation failure then returned NULL will trigger + * zil_commit_writer_stall() at the caller. This is inherently racy, + * since allocation may not have happened yet. + */ + if (lwb->lwb_error != 0) + return (NULL); + + /* + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on what's left of this burst and the previous history. + * While we try to only write used part of the block, we can't just + * always allocate the maximum block size because we can exhaust all + * available pool log space, so we try to be reasonable. + */ + if (zilog->zl_cur_left > 0) { + /* + * We are in the middle of a burst and know how much is left. + * But if workload is multi-threaded there may be more soon. + * Try to predict what can it be and plan for the worst case. + */ + uint_t m; + plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + if (zilog->zl_parallel) { + plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + + zil_lwb_predict(zilog), &m); + if (plan < plan2) + plan = plan2; + } + } else { + /* + * The previous burst is done and we can only predict what + * will come next. 
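A worked example makes the "sufficient for 7 of 8 previous bursts" behaviour of zil_lwb_predict() concrete. The sketch below mirrors the selection logic above; ZIL_BURSTS is assumed here to be 8, and the two history arrays are made-up samples of the values zil_burst_done() (added later in this diff) would have recorded.

#include <limits.h>
#include <stdio.h>

#define NBURSTS 8       /* stands in for ZIL_BURSTS */

static unsigned
predict(const unsigned *prev_opt, const unsigned *prev_min,
    unsigned cur_opt, unsigned cur_min)
{
        unsigned o = cur_opt;   /* UINT_MAX when no burst is in progress */
        unsigned m = cur_min;
        unsigned m1, m2;

        /* Smallest recent "optimal" size: no need to predict below it. */
        for (int i = 0; i < NBURSTS; i++)
                if (prev_opt[i] < o)
                        o = prev_opt[i];

        /* Two largest minimal first-block sizes at or above that optimum. */
        m1 = (m > o) ? m : o;
        m2 = o;
        for (int i = 0; i < NBURSTS; i++) {
                m = prev_min[i];
                if (m >= m1) {
                        m2 = m1;
                        m1 = m;
                } else if (m > m2) {
                        m2 = m;
                }
        }

        /* Use the bigger candidate unless the smaller one saves >= 50%. */
        return ((m1 < m2 * 2) ? m1 : m2);
}

int
main(void)
{
        unsigned opt[NBURSTS] =
            { 40960, 40960, 36864, 40960, 40960, 40960, 40960, 40960 };
        unsigned min[NBURSTS] =
            { 32768, 28672, 32768, 32768, 98304, 32768, 32768, 32768 };

        printf("%u\n", predict(opt, min, UINT_MAX, 0)); /* prints 36864 */
        return (0);
}

Here the single 96 KiB spike in the minimum-size history is not allowed to inflate the prediction: the result, 36864 bytes, still covers the other seven bursts, and skipping the spike saves well over half the space.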
+ */ + plan = zil_lwb_predict(zilog); + } + blksz = plan + sizeof (zil_chain_t); + blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); + blksz = MIN(blksz, zilog->zl_max_block_size); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, + uint64_t, plan); + + return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); +} /* - * Start a log block write and advance to the next log block. - * Calls are serialized. + * Finalize previously closed block and issue the write zio. */ -static lwb_t * +static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { - lwb_t *nlwb = NULL; - zil_chain_t *zilc; spa_t *spa = zilog->zl_spa; - blkptr_t *bp; - dmu_tx_t *tx; - uint64_t txg; - uint64_t zil_blksz, wsz; - int i, error; + zil_chain_t *zilc; boolean_t slog; + zbookmark_phys_t zb; + zio_priority_t prio; + int error; - ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; - } + /* Actually fill the lwb with the data. */ + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); + lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, + ZIO_FLAG_CANFAIL); /* - * Allocate the next block and save its address in this block - * before writing it in order to establish the log chain. - * Note that if the allocation of nlwb synced before we wrote - * the block that points at it (lwb), we'd leak it if we crashed. - * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). - * We dirty the dataset to ensure that zil_sync() will be called - * to clean up in the event of allocation failure or I/O failure. + * The lwb is now ready to be issued, but it can be only if it already + * got its block pointer allocated or the allocation has failed. + * Otherwise leave it as-is, relying on some other thread to issue it + * after allocating its block pointer via calling zil_lwb_write_issue() + * for the previous lwb(s) in the chain. */ + mutex_enter(&zilog->zl_lock); + lwb->lwb_state = LWB_STATE_READY; + if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { + mutex_exit(&zilog->zl_lock); + return; + } + mutex_exit(&zilog->zl_lock); - tx = dmu_tx_create(zilog->zl_os); +next_lwb: + if (lwb->lwb_slim) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); + int wsz = lwb->lwb_sz; + if (lwb->lwb_error == 0) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); + if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, + &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + lwb, prio, ZIO_FLAG_CANFAIL, &zb); + zil_lwb_add_block(lwb, &lwb->lwb_blk); + + if (lwb->lwb_slim) { + /* For Slim ZIL only write what is used. 
*/ + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, + int); + ASSERT3S(wsz, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; + } + memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + } else { + /* + * We can't write the lwb if there was an allocation failure, + * so create a null zio instead just to maintain dependencies. + */ + lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, + zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); + lwb->lwb_write_zio->io_error = lwb->lwb_error; + } + if (lwb->lwb_child_zio) + zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* - * Since we are not going to create any new dirty data, and we - * can even help with clearing the existing dirty data, we - * should not be subject to the dirty data based delays. We - * use TXG_NOTHROTTLE to bypass the delay mechanism. + * Open transaction to allocate the next block pointer. */ + dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - txg = dmu_tx_get_txg(tx); - - lwb->lwb_tx = tx; + uint64_t txg = dmu_tx_get_txg(tx); /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guessing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. - */ - zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - + * Allocate next the block pointer unless we are already in error. + */ + lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); + blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (slog) { - ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); - } else { - ZIL_STAT_BUMP(zil_itx_metaslab_normal_count); - ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused); + error = lwb->lwb_error; + if (error == 0) { + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, + &slog); } if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg); + BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : + ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; - - /* - * Allocate a new log write block (lwb). - */ - nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); } - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - /* For Slim ZIL only write what is used. 
*/ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); - - } else { - wsz = lwb->lwb_sz; - } + /* + * Reduce TXG open time by incrementing inflight counter and committing + * the transaciton. zil_sync() will wait for it to return to zero. + */ + mutex_enter(&zilog->zl_lwb_io_lock); + lwb->lwb_issued_txg = txg; + zilog->zl_lwb_inflight[txg & TXG_MASK]++; + zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); + mutex_exit(&zilog->zl_lwb_io_lock); + dmu_tx_commit(tx); - zilc->zc_pad = 0; - zilc->zc_nused = lwb->lwb_nused; - zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* - * clear unused data for security + * We've completed all potentially blocking operations. Update the + * nlwb and allow it proceed without possible lock order reversals. */ - bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); + mutex_enter(&zilog->zl_lock); + zil_lwb_set_zio_dependency(zilog, lwb); + lwb->lwb_state = LWB_STATE_ISSUED; - spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); + if (nlwb) { + nlwb->lwb_blk = *bp; + nlwb->lwb_error = error; + nlwb->lwb_slog = slog; + nlwb->lwb_alloc_txg = txg; + if (nlwb->lwb_state != LWB_STATE_READY) + nlwb = NULL; + } + mutex_exit(&zilog->zl_lock); - zil_lwb_add_block(lwb, &lwb->lwb_blk); + if (lwb->lwb_slog) { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, + lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); + } else { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, + lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); + } lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - - zio_nowait(lwb->lwb_root_zio); + if (lwb->lwb_child_zio) + zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); + zio_nowait(lwb->lwb_root_zio); /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). + * If nlwb was ready when we gave it the block pointer, + * it is on us to issue it and possibly following ones. */ - return (nlwb); + lwb = nlwb; + if (lwb) + goto next_lwb; } /* - * Maximum amount of write data that can be put into single log block. + * Maximum amount of data that can be put into single log block. */ uint64_t -zil_max_log_data(zilog_t *zilog) +zil_max_log_data(zilog_t *zilog, size_t hdrsize) { - return (zilog->zl_max_block_size - - sizeof (zil_chain_t) - sizeof (lr_write_t)); + return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of - * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). */ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { - return (zil_max_log_data(zilog) / 8); + return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a - * single log block. 
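The hand-off described above, where an lwb parks in LWB_STATE_READY until its predecessor delivers a block pointer and is then issued by whichever side arrives second, follows a common two-flag pattern. The standalone sketch below illustrates that pattern with invented names (toy_lwb, mark_ready, install_bp); in zil.c the same exchange happens under zl_lock between the thread running zil_lwb_write_issue() for this lwb and the one running it for the previous lwb.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy lwb: "ready" is set by the filling thread, "has_bp" by the predecessor. */
struct toy_lwb {
        pthread_mutex_t lock;
        bool ready;
        bool has_bp;
};

static void
issue(struct toy_lwb *lwb)
{
        /* The real code submits the write zio here, then allocates a block
         * pointer for its successor and repeats the same hand-off with it. */
        (void) lwb;
        printf("issued\n");
}

/* Called by the thread that filled the lwb. */
static void
mark_ready(struct toy_lwb *lwb)
{
        bool do_issue;

        pthread_mutex_lock(&lwb->lock);
        lwb->ready = true;
        do_issue = lwb->has_bp;         /* predecessor already delivered a bp */
        pthread_mutex_unlock(&lwb->lock);
        if (do_issue)
                issue(lwb);
}

/* Called by the predecessor once it has allocated the next block pointer. */
static void
install_bp(struct toy_lwb *nlwb)
{
        bool do_issue;

        pthread_mutex_lock(&nlwb->lock);
        nlwb->has_bp = true;
        do_issue = nlwb->ready;         /* owner finished filling it already */
        pthread_mutex_unlock(&nlwb->lock);
        if (do_issue)
                issue(nlwb);
}

int
main(void)
{
        struct toy_lwb lwb = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .ready = false,
                .has_bp = false,
        };

        mark_ready(&lwb);       /* no bp yet: left for the predecessor */
        install_bp(&lwb);       /* predecessor delivers the bp and issues it */
        return (0);
}

Whichever of the two calls happens second sees both flags set and performs the issue exactly once; if the order is reversed, the roles simply swap.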
For space efficiency, we want to fit two records into a - * max-sized log block. + * single log block. Below that it is a tradeoff of additional memory copy + * and possibly worse log space efficiency vs additional range lock/unlock. */ +static uint_t zil_maxcopied = 7680; + uint64_t zil_max_copied_data(zilog_t *zilog) { - return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - - sizeof (lr_write_t)); + uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); + return (MIN(max_data, zil_maxcopied)); } +static uint64_t +zil_itx_record_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen); +} + +static uint64_t +zil_itx_data_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + lr_write_t *lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t)); + return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), + uint64_t)); + } + return (0); +} + +static uint64_t +zil_itx_full_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen + zil_itx_data_size(itx)); +} + +/* + * Estimate space needed in the lwb for the itx. Allocate more lwbs or + * split the itx as needed, but don't touch the actual transaction data. + * Has to be called under zl_issuer_lock to call zil_lwb_write_close() + * to chain more lwbs. + */ static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; + itx_t *citx; + lr_t *lr, *clr; + lr_write_t *lrw; + uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1655,8 +2121,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zil_lwb_write_open(zilog, lwb); - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead @@ -1670,150 +2136,207 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * * For more details, see the comment above zil_commit(). */ - if (lrc->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); + if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); - itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); + list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - dpad = dlen - lrw->lr_length; - } else { - dlen = dpad = 0; - } - reclen = lrc->lrc_reclen; - zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; - - ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); + reclen = lr->lrc_reclen; + ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); + dlen = zil_itx_data_size(itx); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. 
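Plugging numbers into the three limits above helps. In the sketch below CHAIN_HDR and WRITE_HDR are again placeholders for sizeof (zil_chain_t) and sizeof (lr_write_t), and MAXCOPIED is the zil_maxcopied default introduced by this change; with a 128 KiB block the waste allowance works out to roughly 1/16th of the usable space, which is the "~6%" the comment refers to.

#include <stdio.h>

#define MAX_BLK         (128u * 1024u)  /* assumed zl_max_block_size */
#define CHAIN_HDR       128u            /* placeholder: sizeof (zil_chain_t) */
#define WRITE_HDR       192u            /* placeholder: sizeof (lr_write_t) */
#define MAXCOPIED       7680u           /* zil_maxcopied default from this diff */

int
main(void)
{
        unsigned max_log_data = MAX_BLK - CHAIN_HDR - WRITE_HDR;
        unsigned max_waste = max_log_data / 16;
        unsigned max_copied =
            (max_log_data < MAXCOPIED) ? max_log_data : MAXCOPIED;

        /* 130752 bytes of payload, 8172 bytes of tolerated waste (about 6%
         * of the block), and a 7680-byte ceiling for WR_COPIED records. */
        printf("%u %u %u\n", max_log_data, max_waste, max_copied);
        return (0);
}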
*/ - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - max_log_data = zil_max_log_data(zilog); + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; + max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); - zil_lwb_write_open(zilog, lwb); - ASSERT(LWB_EMPTY(lwb)); - lwb_sp = lwb->lwb_sz - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; } + /* + * There must be enough space in the log block to hold reclen. + * For WR_COPIED, we need to fit the whole record in one block, + * and reclen is the write record header size + the data size. + * For WR_NEED_COPY, we can create multiple records, splitting + * the data into multiple blocks, so we only need to fit one + * word of data per block; in this case reclen is just the header + * size (no data). + */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - bcopy(lrc, lr_buf, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + if (dlen > dnow) { + ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); + ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); + citx = zil_itx_clone(itx); + clr = &citx->itx_lr; + lr_write_t *clrw = (lr_write_t *)clr; + clrw->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + zilog->zl_cur_left -= dnow; + } else { + citx = itx; + clr = lr; + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + clr->lrc_seq = ++zilog->zl_lr_seq; + + lwb->lwb_nused += reclen + dnow; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + zil_lwb_add_txg(lwb, lr->lrc_txg); + list_insert_tail(&lwb->lwb_itxs, citx); + + dlen -= dnow; + if (dlen > 0) + goto cont; + + if (lr->lrc_txtype == TX_WRITE && + lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); + + return (lwb); +} + +/* + * Fill the actual transaction data into the lwb, following zil_lwb_assign(). + * Does not require locking. 
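The split between zil_lwb_assign() and zil_lwb_commit() is a reserve-then-fill scheme: space accounting and ordering decisions happen under zl_issuer_lock, while the actual copy of the record data is deferred until zil_lwb_write_issue() runs, outside that lock. The sketch below shows the general shape of such a scheme with invented names (logbuf, assign, commit); it is not the exact bookkeeping used here, where the copy offset is tracked in lwb_nfilled as the lwb's itx list is replayed at issue time.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* Two-phase buffer fill: reserve space under a lock (cheap, preserves the
 * record order), copy the payload later without the lock (expensive). */
struct logbuf {
        pthread_mutex_t lock;   /* stands in for zl_issuer_lock */
        char buf[4096];
        size_t nused;           /* space reserved so far ("assign" phase) */
        size_t nfilled;         /* bytes actually copied ("commit" phase) */
};

static size_t
assign(struct logbuf *lb, size_t reclen)
{
        size_t off;

        pthread_mutex_lock(&lb->lock);
        off = lb->nused;        /* this record's slot, fixed once and for all */
        lb->nused += reclen;
        pthread_mutex_unlock(&lb->lock);
        return (off);
}

static void
commit(struct logbuf *lb, size_t off, const void *rec, size_t reclen)
{
        /* No lock needed: the slot [off, off + reclen) belongs to this record,
         * and in zil.c all commits for one lwb run in the issuing thread. */
        memcpy(lb->buf + off, rec, reclen);
        lb->nfilled += reclen;
}

int
main(void)
{
        struct logbuf lb = { .lock = PTHREAD_MUTEX_INITIALIZER };
        size_t off = assign(&lb, 16);

        commit(&lb, off, "0123456789abcdef", 16);
        printf("used=%zu filled=%zu\n", lb.nused, lb.nfilled);
        return (0);
}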
+ */ +static void +zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) +{ + lr_t *lr, *lrb; + lr_write_t *lrw, *lrwb; + char *lr_buf; + uint64_t dlen, reclen; + + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; - ZIL_STAT_BUMP(zil_itx_count); + if (lr->lrc_txtype == TX_COMMIT) + return; + + reclen = lr->lrc_reclen; + dlen = zil_itx_data_size(itx); + ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); + + lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; + memcpy(lr_buf, lr, reclen); + lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ + lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ + + ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { - ZIL_STAT_BUMP(zil_itx_copied_count); - ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length); + ZIL_STAT_BUMP(zilog, zil_itx_copied_count); + ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, + lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; - ZIL_STAT_BUMP(zil_itx_needcopy_count); - ZIL_STAT_INCR(zil_itx_needcopy_bytes, dnow); + lrb->lrc_reclen += dlen; + ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); + ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, + dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; - ZIL_STAT_BUMP(zil_itx_indirect_count); - ZIL_STAT_INCR(zil_itx_indirect_bytes, + ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); + ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); + if (lwb->lwb_child_zio == NULL) { + lwb->lwb_child_zio = zio_null(NULL, + zilog->zl_spa, NULL, NULL, NULL, + ZIO_FLAG_CANFAIL); + } } /* - * We pass in the "lwb_write_zio" rather than - * "lwb_root_zio" so that the "lwb_write_zio" - * becomes the parent of any zio's created by - * the "zl_get_data" callback. The vdevs are - * flushed after the "lwb_write_zio" completes, - * so we want to make sure that completion - * callback waits for these additional zio's, - * such that the vdevs used by those zio's will - * be included in the lwb's vdev tree, and those - * vdevs will be properly flushed. If we passed - * in "lwb_root_zio" here, then these additional - * vdevs may not be flushed; e.g. if these zio's - * completed after "lwb_write_zio" completed. + * The "lwb_child_zio" we pass in will become a child of + * "lwb_write_zio", when one is created, so one will be + * a parent of any zio's created by the "zl_get_data". + * This way "lwb_write_zio" will first wait for children + * block pointers before own writing, and then for their + * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, - lwb->lwb_write_zio); - if (dbuf != NULL && error == 0 && dnow == dlen) + lwb->lwb_child_zio); + if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. 
*/ - bzero((char *)dbuf + lrwb->lr_length, dpad); - - if (error == EIO) { - txg_wait_synced(zilog->zl_dmu_pool, txg); - return (lwb); + memset((char *)dbuf + lrwb->lr_length, 0, + dlen - lrwb->lr_length); } - if (error != 0) { - ASSERT(error == ENOENT || error == EEXIST || - error == EALREADY); - return (lwb); + + /* + * Typically, the only return values we should see from + * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or + * EALREADY. However, it is also possible to see other + * error values such as ENOSPC or EINVAL from + * dmu_read() -> dnode_hold() -> dnode_hold_impl() or + * ENXIO as well as a multitude of others from the + * block layer through dmu_buf_hold() -> dbuf_read() + * -> zio_wait(), as well as through dmu_read() -> + * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> + * zio_wait(). When these errors happen, we can assume + * that neither an immediate write nor an indirect + * write occurred, so we need to fall back to + * txg_wait_synced(). This is unusual, so we print to + * dmesg whenever one of these errors occurs. + */ + switch (error) { + case 0: + break; + default: + cmn_err(CE_WARN, "zil_lwb_commit() received " + "unexpected error %d from ->zl_get_data()" + ". Falling back to txg_wait_synced().", + error); + zfs_fallthrough; + case EIO: + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); + zfs_fallthrough; + case ENOENT: + zfs_fallthrough; + case EEXIST: + zfs_fallthrough; + case EALREADY: + return; } } } - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. - */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); + lwb->lwb_nfilled += reclen + dlen; + ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); + ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * @@ -1822,14 +2345,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize) size_t itxsize, lrsize; itx_t *itx; + ASSERT3U(olrsize, >=, sizeof (lr_t)); lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); + ASSERT3U(lrsize, >=, olrsize); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ - bzero((char *)&itx->itx_lr + olrsize, lrsize - olrsize); + memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; @@ -1838,9 +2363,26 @@ zil_itx_create(uint64_t txtype, size_t olrsize) return (itx); } +static itx_t * +zil_itx_clone(itx_t *oitx) +{ + ASSERT3U(oitx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(oitx->itx_size, ==, + offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen); + + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); + memcpy(itx, oitx, oitx->itx_size); + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; + return (itx); +} + void zil_itx_destroy(itx_t *itx) { + ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(itx->itx_lr.lrc_reclen, ==, + itx->itx_size - offsetof(itx_t, itx_lr)); IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != 
NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); @@ -1865,11 +2407,11 @@ zil_itxg_clean(void *arg) itx_async_node_t *ian; list = &itxs->i_sync_list; - while ((itx = list_head(list)) != NULL) { + while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having + * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * @@ -1888,7 +2430,6 @@ zil_itxg_clean(void *arg) if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); - list_remove(list, itx); zil_itx_destroy(itx); } @@ -1896,8 +2437,7 @@ zil_itxg_clean(void *arg) t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; - while ((itx = list_head(list)) != NULL) { - list_remove(list, itx); + while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); @@ -1926,7 +2466,7 @@ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; list_t clean_list; @@ -1953,13 +2493,13 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); + ian_search.ia_foid = oid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } - while ((itx = list_head(&clean_list)) != NULL) { - list_remove(&clean_list, itx); + while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); @@ -2090,10 +2630,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ -static void +static uint64_t zil_get_commit_list(zilog_t *zilog) { - uint64_t otxg, txg; + uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2127,10 +2667,33 @@ zil_get_commit_list(zilog_t *zilog) */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); - list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); + list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + itx_t *itx = NULL; + if (unlikely(zilog->zl_suspend > 0)) { + /* + * ZIL was just suspended, but we lost the race. + * Allow all earlier itxs to be committed, but ask + * caller to do txg_wait_synced(txg) for any new. 
+ */ + if (!list_is_empty(sync_list)) + wtxg = MAX(wtxg, txg); + } else { + itx = list_head(sync_list); + list_move_tail(commit_list, sync_list); + } mutex_exit(&itxg->itxg_lock); + + while (itx != NULL) { + uint64_t s = zil_itx_full_size(itx); + zilog->zl_cur_size += s; + zilog->zl_cur_left += s; + s = zil_itx_record_size(itx); + zilog->zl_cur_max = MAX(zilog->zl_cur_max, s); + itx = list_next(commit_list, itx); + } } + return (wtxg); } /* @@ -2140,7 +2703,7 @@ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; @@ -2170,7 +2733,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid) */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { - ian = avl_find(t, &foid, &where); + ian_search.ia_foid = foid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); @@ -2224,7 +2788,6 @@ zil_prune_commit_list(zilog_t *zilog) zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); - itx->itx_private = NULL; } mutex_exit(&zilog->zl_lock); @@ -2261,7 +2824,27 @@ zil_commit_writer_stall(zilog_t *zilog) */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); +} + +static void +zil_burst_done(zilog_t *zilog) +{ + if (!list_is_empty(&zilog->zl_itx_commit_list) || + zilog->zl_cur_size == 0) + return; + + if (zilog->zl_parallel) + zilog->zl_parallel--; + + uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1); + zilog->zl_prev_rotor = r; + zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size, + &zilog->zl_prev_min[r]); + + zilog->zl_cur_size = 0; + zilog->zl_cur_max = 0; + zilog->zl_cur_left = 0; } /* @@ -2271,12 +2854,12 @@ zil_commit_writer_stall(zilog_t *zilog) * lwb will be issued to the zio layer to be written to disk. */ static void -zil_process_commit_list(zilog_t *zilog) +zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; - lwb_t *lwb; + lwb_t *lwb, *plwb; itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2285,7 +2868,7 @@ zil_process_commit_list(zilog_t *zilog) * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ - if (list_head(&zilog->zl_itx_commit_list) == NULL) + if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -2296,12 +2879,32 @@ zil_process_commit_list(zilog_t *zilog) if (lwb == NULL) { lwb = zil_create(zilog); } else { - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + /* + * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will + * have already been created (zl_lwb_list not empty). + */ + zil_commit_activate_saxattr_feature(zilog); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); + + /* + * If the lwb is still opened, it means the workload is really + * multi-threaded and we won the chance of write aggregation. 
+ * If it is not opened yet, but previous lwb is still not + * flushed, it still means the workload is multi-threaded, but + * there was too much time between the commits to aggregate, so + * we try aggregation next times, but without too much hopes. + */ + if (lwb->lwb_state == LWB_STATE_OPENED) { + zilog->zl_parallel = ZIL_BURSTS; + } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) + != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { + zilog->zl_parallel = MAX(zilog->zl_parallel, + ZIL_BURSTS / 2); + } } - while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { + while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; @@ -2315,8 +2918,6 @@ zil_process_commit_list(zilog_t *zilog) zilog_t *, zilog, itx_t *, itx); } - list_remove(&zilog->zl_itx_commit_list, itx); - boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); @@ -2366,22 +2967,31 @@ zil_process_commit_list(zilog_t *zilog) */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - - if (lwb == NULL) + lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); + if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); - else - list_insert_tail(&lwb->lwb_itxs, itx); + } else if ((zcw->zcw_lwb != NULL && + zcw->zcw_lwb != lwb) || zcw->zcw_done) { + /* + * Our lwb is done, leave the rest of + * itx list to somebody else who care. + */ + zilog->zl_parallel = ZIL_BURSTS; + zilog->zl_cur_left -= + zil_itx_full_size(itx); + break; + } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } - list_insert_tail(&nolwb_itxs, itx); } + zilog->zl_cur_left -= zil_itx_full_size(itx); } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); + zilog->zl_cur_left -= zil_itx_full_size(itx); zil_itx_destroy(itx); } } @@ -2393,6 +3003,8 @@ zil_process_commit_list(zilog_t *zilog) * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* @@ -2402,54 +3014,45 @@ zil_process_commit_list(zilog_t *zilog) * normal. */ zil_commit_waiter_t *zcw; - while ((zcw = list_head(&nolwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); - list_remove(&nolwb_waiters, zcw); - } /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ - while ((itx = list_head(&nolwb_itxs)) != NULL) { - list_remove(&nolwb_itxs, itx); + while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); - } } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + ASSERT(lwb->lwb_state == LWB_STATE_NEW || + lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" - * variable is in one of the following states: "closed" - * or "open". + * variable is in "new" or "opened" state. * - * If it's "closed", then no itxs have been committed to - * it, so there's no point in issuing its zio (i.e. it's - * "empty"). + * If it's "new", then no itxs have been committed to it, so + * there's no point in issuing its zio (i.e. it's "empty"). 
* - * If it's "open", then it contains one or more itxs that + * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * - * 1. Ideally, there will be more ZIL activity occurring - * on the system, such that this function will be - * immediately called again (not necessarily by the same - * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to - * be "full" when it is issued to disk, and we'll make - * use of the lwb's size the best we can. + * 1. Ideally, there will be more ZIL activity occurring on + * the system, such that this function will be immediately + * called again by different thread and this lwb will be + * closed by zil_lwb_assign(). This way, the lwb will be + * "full" when it is issued to disk, and we'll make use of + * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on - * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the - * lwb's zio. If this occurs, the lwb is not guaranteed + * the system, zil_commit_waiter() will close it and issue + * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. @@ -2470,6 +3073,16 @@ zil_process_commit_list(zilog_t *zilog) * possible, without significantly impacting the latency * of each individual itx. */ + if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + zil_burst_done(zilog); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + zil_commit_writer_stall(zilog); + } + } } } @@ -2487,12 +3100,17 @@ zil_process_commit_list(zilog_t *zilog) * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). 
*/ -static void +static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { + list_t ilwbs; + lwb_t *lwb; + uint64_t wtxg = 0; + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); + list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { @@ -2515,14 +3133,18 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) goto out; } - ZIL_STAT_BUMP(zil_commit_writer_count); + ZIL_STAT_BUMP(zilog, zil_commit_writer_count); - zil_get_commit_list(zilog); + wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); + zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); + while ((lwb = list_remove_head(&ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + list_destroy(&ilwbs); + return (wtxg); } static void @@ -2534,7 +3156,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can @@ -2543,13 +3165,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + if (lwb->lwb_state != LWB_STATE_OPENED) return; /* - * In order to call zil_lwb_write_issue() we must hold the + * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order @@ -2567,8 +3187,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ - if (zcw->zcw_done) - goto out; + if (zcw->zcw_done) { + mutex_exit(&zilog->zl_issuer_lock); + return; + } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -2578,26 +3200,33 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition - * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb - * _can_ transition from ISSUED to DONE, but it's OK to race with + * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb + * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in - * the ISSUED or DONE states. + * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on - * if it's ISSUED or OPENED, and block any other threads that might - * attempt to issue this lwb. For that reason we hold the + * if it's OPENED or CLOSED, and block any other threads that might + * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. 
*/ - if (lwb->lwb_state == LWB_STATE_ISSUED || - lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) - goto out; + if (lwb->lwb_state != LWB_STATE_OPENED) { + mutex_exit(&zilog->zl_issuer_lock); + return; + } - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + /* + * We do not need zcw_lock once we hold zl_issuer_lock and know lwb + * is still open. But we have to drop it to avoid a deadlock in case + * callback of zio issued by zil_lwb_write_issue() try to get it, + * while zil_lwb_write_issue() is blocked on attempt to issue next + * lwb it found in LWB_STATE_READY state. + */ + mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and @@ -2605,55 +3234,27 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); + zil_burst_done(zilog); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); - - /* - * Since the lwb's zio hadn't been issued by the time this thread - * reached its timeout, we reset the zilog's "zl_cur_used" field - * to influence the zil block size selection algorithm. - * - * By having to issue the lwb's zio here, it means the size of the - * lwb was too large, given the incoming throughput of itxs. By - * setting "zl_cur_used" to zero, we communicate this fact to the - * block size selection algorithm, so it can take this information - * into account, and potentially select a smaller size for the - * next lwb block that is allocated. - */ - zilog->zl_cur_used = 0; + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); if (nlwb == NULL) { /* - * When zil_lwb_write_issue() returns NULL, this + * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. - * - * We must drop the commit waiter's lock prior to - * calling zil_commit_writer_stall() or else we can wind - * up with the following deadlock: - * - * - This thread is waiting for the txg to sync while - * holding the waiter's lock; txg_wait_synced() is - * used within txg_commit_writer_stall(). - * - * - The txg can't sync because it is waiting for this - * lwb's zio callback to call dmu_tx_commit(). - * - * - The lwb's zio callback can't call dmu_tx_commit() - * because it's blocked trying to acquire the waiter's - * lock, which occurs prior to calling dmu_tx_commit() */ - mutex_exit(&zcw->zcw_lock); + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); - mutex_enter(&zcw->zcw_lock); + mutex_exit(&zilog->zl_issuer_lock); + } else { + mutex_exit(&zilog->zl_issuer_lock); + zil_lwb_write_issue(zilog, lwb); } - -out: - mutex_exit(&zilog->zl_issuer_lock); - ASSERT(MUTEX_HELD(&zcw->zcw_lock)); + mutex_enter(&zcw->zcw_lock); } /* @@ -2667,7 +3268,7 @@ out: * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using - * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() + * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. 
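The locking dance above follows a standard shape: a thread holding only the lower-ranked lock must drop it, take the higher-ranked lock, retake the lower one, and then revalidate whatever it learned before dropping it; and it must not hold the waiter's lock across anything whose completion callback may also want that lock. The sketch below shows that shape with invented names (struct waiter, issuer_lock, timeout_path); it is a simplification of zil_commit_waiter_timeout(), not a copy of it.

#include <pthread.h>
#include <stdbool.h>

struct waiter {
        pthread_mutex_t lock;   /* lower-ranked lock, like zcw_lock */
        bool done;
};

static pthread_mutex_t issuer_lock = PTHREAD_MUTEX_INITIALIZER; /* higher-ranked */

static void
timeout_path(struct waiter *w)
{
        /* Entered and exited holding w->lock, like the real timeout path. */
        pthread_mutex_unlock(&w->lock);
        pthread_mutex_lock(&issuer_lock);
        pthread_mutex_lock(&w->lock);

        if (w->done) {          /* state may have changed while unlocked */
                pthread_mutex_unlock(&issuer_lock);
                return;
        }

        /*
         * Drop w->lock again before doing anything whose completion callback
         * also needs it, then do the work holding only issuer_lock.
         */
        pthread_mutex_unlock(&w->lock);
        /* ... close and issue the lwb here ... */
        pthread_mutex_unlock(&issuer_lock);
        pthread_mutex_lock(&w->lock);
}

int
main(void)
{
        struct waiter w = { .lock = PTHREAD_MUTEX_INITIALIZER, .done = false };

        pthread_mutex_lock(&w.lock);    /* callers enter holding the waiter lock */
        timeout_path(&w);
        pthread_mutex_unlock(&w.lock);
        return (0);
}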
@@ -2718,7 +3319,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ - IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED); + IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); @@ -2766,6 +3367,8 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) */ IMPLY(lwb != NULL, + lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); @@ -2812,7 +3415,14 @@ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + /* + * Since we are not going to create any new dirty data, and we + * can even help with clearing the existing dirty data, we + * should not be subject to the dirty data based delays. We + * use TXG_NOTHROTTLE to bypass the delay mechanism. + */ + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; @@ -2869,7 +3479,7 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * - * The commit it is special; it doesn't have any on-disk representation. + * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled @@ -2884,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion - * callback of the root zio for the DKIOCFLUSHWRITECACHE commands - * that are sent to the vdevs upon completion of the lwb zio. + * callback of the root zio for the flush commands that are sent to + * the vdevs upon completion of the lwb zio. * * 2. 
When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as @@ -2992,7 +3602,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) void zil_commit_impl(zilog_t *zilog, uint64_t foid) { - ZIL_STAT_BUMP(zil_commit_count); + ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" @@ -3023,7 +3633,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); - zil_commit_writer(zilog, zcw); + uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { @@ -3038,6 +3648,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (wtxg != 0) { + txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); @@ -3062,6 +3674,8 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) if (spa_sync_pass(spa) != 1) return; + zil_lwb_flush_wait_all(zilog, txg); + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); @@ -3074,11 +3688,13 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - ASSERT(list_head(&zilog->zl_lwb_list) == NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); - bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); + memset(zh, 0, sizeof (zil_header_t)); + memset(zilog->zl_replayed_seq, 0, + sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* @@ -3091,15 +3707,27 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; + } else { + /* + * A destroyed ZIL chain can't contain any TX_SETSAXATTR + * records. So, deactivate the feature for this dataset. + * We activate it again when we start a new ZIL chain. + */ + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_ZILSAXATTR)) + dsl_dataset_deactivate_feature(ds, + SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; - if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) + if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || + lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free(spa, txg, &lwb->lwb_blk); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* @@ -3108,29 +3736,17 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ - if (list_head(&zilog->zl_lwb_list) == NULL) + if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } - /* - * Remove fastwrite on any blocks that have been pre-allocated for - * the next commit. This prevents fastwrite counter pollution by - * unused, long-lived LWBs. 
- */ - for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { - if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) { - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 0; - } - } - mutex_exit(&zilog->zl_lock); } -/* ARGSUSED */ static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { + (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), @@ -3141,10 +3757,10 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) return (0); } -/* ARGSUSED */ static void zil_lwb_dest(void *vbuf, void *unused) { + (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); @@ -3161,13 +3777,16 @@ zil_init(void) zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - zil_ksp = kstat_create("zfs", 0, "zil", "misc", + zil_sums_init(&zil_sums_global); + zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (zil_ksp != NULL) { - zil_ksp->ks_data = &zil_stats; - kstat_install(zil_ksp); + if (zil_kstats_global != NULL) { + zil_kstats_global->ks_data = &zil_stats; + zil_kstats_global->ks_update = zil_kstats_global_update; + zil_kstats_global->ks_private = NULL; + kstat_install(zil_kstats_global); } } @@ -3177,10 +3796,12 @@ zil_fini(void) kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); - if (zil_ksp != NULL) { - kstat_delete(zil_ksp); - zil_ksp = NULL; + if (zil_kstats_global != NULL) { + kstat_delete(zil_kstats_global); + zil_kstats_global = NULL; } + + zil_sums_fini(&zil_sums_global); } void @@ -3212,10 +3833,13 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; + zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize, + ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ), + spa_maxblocksize(dmu_objset_spa(os))); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, @@ -3229,6 +3853,12 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); + + for (int i = 0; i < ZIL_BURSTS; i++) { + zilog->zl_prev_opt[i] = zilog->zl_max_block_size - + sizeof (zil_chain_t); + } return (zilog); } @@ -3264,8 +3894,10 @@ zil_free(zilog_t *zilog) mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); + mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); + cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } @@ -3274,7 +3906,7 @@ zil_free(zilog_t *zilog) * Open an intent log. 
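The new clamp on zl_max_block_size in zil_alloc() keeps the zil_maxblocksize tunable sane: the value is rounded down to the ZIL block granularity, floored at one minimum block, and capped at the pool's maximum block size. The sketch below assumes ZIL_MIN_BLKSZ is 4 KiB (consistent with the "multiple of 4KB" rule in the removed bucket table earlier in this diff) and that spa_maxblocksize() is at least that large; clamp_max_block is an invented helper, not a function in zil.c.

#include <stdint.h>
#include <stdio.h>

#define ZIL_MIN_BLKSZ   4096u   /* assumed 4 KiB ZIL block granularity */

static uint64_t
clamp_max_block(uint64_t tunable, uint64_t spa_max)
{
        uint64_t v = tunable & ~(uint64_t)(ZIL_MIN_BLKSZ - 1);  /* align down */

        if (v < ZIL_MIN_BLKSZ)
                v = ZIL_MIN_BLKSZ;      /* floor at one minimum block */
        if (v > spa_max)
                v = spa_max;            /* cap at the pool maximum */
        return (v);
}

int
main(void)
{
        /* An odd zil_maxblocksize of 200000 rounds down to 196608 (48 * 4096);
         * a nonsense value of 1000 is pulled up to the 4096-byte floor. */
        printf("%llu %llu\n",
            (unsigned long long)clamp_max_block(200000, 16u << 20),
            (unsigned long long)clamp_max_block(1000, 16u << 20));
        return (0);
}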
*/ zilog_t * -zil_open(objset_t *os, zil_get_data_t *get_data) +zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); @@ -3283,6 +3915,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data) ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; + zilog->zl_sums = zil_sums; return (zilog); } @@ -3299,23 +3932,33 @@ zil_close(zilog_t *zilog) if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); + txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); - if (lwb == NULL) - txg = zilog->zl_dirty_max_txg; - else - txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + if (lwb != NULL) { + txg = MAX(txg, lwb->lwb_alloc_txg); + txg = MAX(txg, lwb->lwb_max_txg); + } mutex_exit(&zilog->zl_lock); /* - * We need to use txg_wait_synced() to wait long enough for the - * ZIL to be clean, and to wait for all pending lwbs to be - * written out. + * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends + * on the time when the dmu_tx transaction is assigned in + * zil_lwb_write_issue(). + */ + mutex_enter(&zilog->zl_lwb_io_lock); + txg = MAX(zilog->zl_lwb_max_issued_txg, txg); + mutex_exit(&zilog->zl_lwb_io_lock); + + /* + * We need to use txg_wait_synced() to wait until that txg is synced. + * zil_sync() will guarantee all lwbs up to that txg have been + * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); @@ -3332,22 +3975,17 @@ zil_close(zilog_t *zilog) * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); - lwb = list_head(&zilog->zl_lwb_list); + lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { - ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); - - if (lwb->lwb_fastwrite) - metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - - list_remove(&zilog->zl_lwb_list, lwb); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } -static char *suspend_tag = "zil suspending"; +static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor @@ -3461,7 +4099,7 @@ zil_suspend(const char *osname, void **cookiep) /* * We need to use zil_commit_impl to ensure we wait for all - * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed + * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be @@ -3508,7 +4146,7 @@ zil_resume(void *cookie) } typedef struct zil_replay_arg { - zil_replay_func_t **zr_replay; + zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; @@ -3570,7 +4208,7 @@ zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, /* * Make a copy of the data so we can revise and extend it. */ - bcopy(lr, zr->zr_lr, reclen); + memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. 
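In the zil_close() hunk above, the txg to wait on now also covers the last lwb's allocation txg and the new zl_lwb_max_issued_txg. A stand-alone sketch of that max computation, with illustrative names only:

#include <stdint.h>

static uint64_t
illus_max(uint64_t a, uint64_t b)
{
	return (a > b ? a : b);
}

static uint64_t
illus_zil_close_txg(uint64_t dirty_max_txg, int have_lwb,
    uint64_t lwb_alloc_txg, uint64_t lwb_max_txg, uint64_t max_issued_txg)
{
	uint64_t txg = dirty_max_txg;
	if (have_lwb) {
		txg = illus_max(txg, lwb_alloc_txg);	/* block allocation */
		txg = illus_max(txg, lwb_max_txg);	/* records in the lwb */
	}
	txg = illus_max(txg, max_issued_txg);	/* txg assigned at issue time */
	return (txg);	/* 0 means there is nothing to wait for */
}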
@@ -3615,10 +4253,11 @@ zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, return (0); } -/* ARGSUSED */ static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { + (void) bp, (void) arg, (void) claim_txg; + zilog->zl_replay_blks++; return (0); @@ -3626,17 +4265,18 @@ zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) /* * If this dataset has a non-empty intent log, replay it and destroy it. + * Return B_TRUE if there were any entries to replay. */ -void -zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) +boolean_t +zil_replay(objset_t *os, void *arg, + zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { - zil_destroy(zilog, B_TRUE); - return; + return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; @@ -3659,6 +4299,8 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; + + return (B_TRUE); } boolean_t @@ -3677,13 +4319,12 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *tx) return (B_FALSE); } -/* ARGSUSED */ int zil_reset(const char *osname, void *arg) { - int error; + (void) arg; - error = zil_suspend(osname, NULL); + int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); @@ -3714,9 +4355,11 @@ EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); +EXPORT_SYMBOL(zil_sums_init); +EXPORT_SYMBOL(zil_sums_fini); +EXPORT_SYMBOL(zil_kstat_values_update); -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, @@ -3725,9 +4368,11 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); -ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); -ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); -/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, + "Limit in bytes WR_COPIED size"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index c016fa323b41..d68d5ababe79 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,10 +20,10 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2022 by Delphix. All rights reserved. 
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */ @@ -41,6 +41,7 @@ #include <sys/zio_checksum.h> #include <sys/dmu_objset.h> #include <sys/arc.h> +#include <sys/brt.h> #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> @@ -57,33 +58,33 @@ * I/O type descriptions * ========================================================================== */ -const char *zio_type_name[ZIO_TYPES] = { +const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ - "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" + "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; -int zio_deadman_log_all = B_FALSE; +static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ -kmem_cache_t *zio_cache; -kmem_cache_t *zio_link_cache; +static kmem_cache_t *zio_cache; +static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) -uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; -uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; +static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ -int zio_slow_io_ms = (30 * MILLISEC); +static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) @@ -114,9 +115,15 @@ int zio_slow_io_ms = (30 * MILLISEC); * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ -int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ -int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ -int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ + +/* defer frees starting in this pass */ +uint_t zfs_sync_pass_deferred_free = 2; + +/* don't compress starting in this pass */ +static uint_t zfs_sync_pass_dont_compress = 8; + +/* rewrite new bps starting in this pass */ +static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate @@ -129,12 +136,12 @@ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ * allocations as well. */ int zio_exclude_metadata = 0; -int zio_requeue_io_start_cut_in_line = 1; +static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG -int zio_buf_debug_limit = 16384; +static const int zio_buf_debug_limit = 16384; #else -int zio_buf_debug_limit = 0; +static const int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); @@ -151,32 +158,22 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - /* - * For small buffers, we want a cache for each multiple of - * SPA_MINBLOCKSIZE. 
For larger buffers, we want a cache - * for each quarter-power of 2. - */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t p2 = size; - size_t align = 0; - size_t data_cflags, cflags; - - data_cflags = KMC_NODEBUG; - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; + size_t align, cflags, data_cflags; + char name[32]; -#if defined(_ILP32) && defined(_KERNEL) /* - * Cache size limited to 1M on 32-bit platforms until ARC - * buffers no longer require virtual address space. + * Create cache for each half-power of 2 size, starting from + * SPA_MINBLOCKSIZE. It should give us memory space efficiency + * of ~7/8, sufficient for transient allocations mostly using + * these caches. */ - if (size > zfs_max_recordsize) - break; -#endif - + size_t p2 = size; while (!ISP2(p2)) p2 &= p2 - 1; + if (!IS_P2ALIGNED(size, p2 / 2)) + continue; #ifndef _KERNEL /* @@ -187,47 +184,37 @@ zio_init(void) */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; - /* - * Here's the problem - on 4K native devices in userland on - * Linux using O_DIRECT, buffers must be 4K aligned or I/O - * will fail with EINVAL, causing zdb (and others) to coredump. - * Since userland probably doesn't need optimized buffer caches, - * we just force 4K alignment on everything. - */ - align = 8 * SPA_MINBLOCKSIZE; -#else - if (size < PAGESIZE) { - align = SPA_MINBLOCKSIZE; - } else if (IS_P2ALIGNED(size, p2 >> 2)) { - align = PAGESIZE; - } #endif - if (align != 0) { - char name[36]; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, - size, align, NULL, NULL, NULL, NULL, NULL, - cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; - } - (void) snprintf(name, sizeof (name), "zio_buf_%lu", - (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, cflags); - - (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", - (ulong_t)size); - zio_data_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, data_cflags); + if (IS_P2ALIGNED(size, PAGESIZE)) + align = PAGESIZE; + else + align = 1 << (highbit64(size ^ (size - 1)) - 1); + + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + data_cflags = KMC_NODEBUG; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. + */ + (void) snprintf(name, sizeof (name), + "zio_buf_comb_%lu", (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + zio_data_buf_cache[c] = zio_buf_cache[c]; + continue; } + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, cflags); + + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, align, + NULL, NULL, NULL, NULL, NULL, data_cflags); } while (--c != 0) { @@ -308,6 +295,53 @@ zio_fini(void) * ========================================================================== */ +#ifdef ZFS_DEBUG +static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; +#endif + +/* + * Use empty space after the buffer to detect overflows. 
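The rewritten zio_init() loop above only creates kmem caches for "half power of two" sizes and derives each cache's alignment from the size itself. The stand-alone program below reproduces that selection rule so you can see which size classes survive and what alignment they get; the constants (512-byte SPA_MINBLOCKSIZE, 4 KiB pages) are assumptions and the names are not ZFS symbols.

#include <stdint.h>
#include <stdio.h>

#define	ILLUS_SPA_MINBLOCKSIZE	512ULL
#define	ILLUS_PAGESIZE		4096ULL

static int
is_p2aligned(uint64_t x, uint64_t a)
{
	return ((x & (a - 1)) == 0);
}

int
main(void)
{
	for (uint64_t size = ILLUS_SPA_MINBLOCKSIZE; size <= 16384;
	    size += ILLUS_SPA_MINBLOCKSIZE) {
		uint64_t p2 = size;
		while (p2 & (p2 - 1))
			p2 &= p2 - 1;	/* largest power of two <= size */
		if (!is_p2aligned(size, p2 / 2))
			continue;	/* no dedicated cache for this size */
		uint64_t align = is_p2aligned(size, ILLUS_PAGESIZE) ?
		    ILLUS_PAGESIZE : (size & -size);
		printf("cache for %6llu bytes, align %llu\n",
		    (unsigned long long)size, (unsigned long long)align);
	}
	return (0);
}

With these assumptions it reports caches at 512, 1024, 1536, 2048, 3072, 4096, 6144, and so on, matching the half-power-of-two spacing described in the new comment.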
+ * + * Since zio_init() creates kmem caches only for certain set of buffer sizes, + * allocations of different sizes may have some unused space after the data. + * Filling part of that space with a known pattern on allocation and checking + * it on free should allow us to detect some buffer overflows. + */ +static void +zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) + *canary = zio_buf_canary; +#endif +} + +static void +zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) { + if (unlikely(*canary != zio_buf_canary)) { + PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", + p, size, (canary - p) * sizeof (ulong_t), + *canary, zio_buf_canary); + } + } +#endif +} + /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's @@ -324,7 +358,9 @@ zio_buf_alloc(size_t size) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_buf_cache, c); + return (p); } /* @@ -340,7 +376,9 @@ zio_data_buf_alloc(size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_data_buf_cache, c); + return (p); } void @@ -353,6 +391,7 @@ zio_buf_free(void *buf, size_t size) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif + zio_buf_check_canary(buf, size, zio_buf_cache, c); kmem_cache_free(zio_buf_cache[c], buf); } @@ -363,12 +402,14 @@ zio_data_buf_free(void *buf, size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + zio_buf_check_canary(buf, size, zio_data_buf_cache, c); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { + (void) size; abd_free((abd_t *)abd); } @@ -514,8 +555,9 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) /* * If this is an authenticated block, just check the MAC. It would be - * nice to separate this out into its own flag, but for the moment - * enum zio_flag is out of bits. + * nice to separate this out into its own flag, but when this was done, + * we had run out of bits in what is now zio_flag_t. Future cleanup + * could make this a flag bit. 
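The canary helpers above fill the unused tail of a size-class buffer with a known pattern and verify it on free. Below is a stand-alone demonstration of the same idea against plain malloc(); the constant and helper names are illustrative, not the ZFS ones.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define	ILLUS_CANARY	((uint64_t)0xdeadc0dedead210bULL)

static void
put_canary(void *buf, size_t used, size_t cache_size)
{
	uint64_t *p = (uint64_t *)((char *)buf + ((used + 7) & ~(size_t)7));
	uint64_t *end = (uint64_t *)((char *)buf + cache_size);
	while (p < end)
		*p++ = ILLUS_CANARY;	/* pattern the slack space */
}

static int
check_canary(const void *buf, size_t used, size_t cache_size)
{
	const uint64_t *p = (const uint64_t *)
	    ((const char *)buf + ((used + 7) & ~(size_t)7));
	const uint64_t *end =
	    (const uint64_t *)((const char *)buf + cache_size);
	while (p < end) {
		if (*p++ != ILLUS_CANARY)
			return (0);	/* overflow detected */
	}
	return (1);
}

int
main(void)
{
	/* A 1000-byte request served from a 1536-byte size class. */
	void *buf = malloc(1536);
	put_canary(buf, 1000, 1536);
	memset(buf, 0xab, 1010);	/* write 10 bytes past the request */
	printf("canary intact: %d\n", check_canary(buf, 1000, 1536));
	free(buf);
	return (0);
}

The memset deliberately runs 10 bytes past the requested size, so the check reports the canary as broken.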
*/ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { @@ -570,7 +612,8 @@ error: if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, &zio->io_bookmark); + spa_log_error(spa, &zio->io_bookmark, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } @@ -625,8 +668,6 @@ zio_unique_parent(zio_t *cio) void zio_add_child(zio_t *pio, zio_t *cio) { - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); - /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. @@ -635,6 +676,12 @@ zio_add_child(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + /* Parent should not have READY stage if child doesn't have it. */ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -643,19 +690,53 @@ zio_add_child(zio_t *pio, zio_t *cio) ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); - pio->io_child_count++; - cio->io_parent_count++; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } +void +zio_add_child_first(zio_t *pio, zio_t *cio) +{ + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. + */ + ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + + /* Parent should not have READY stage if child doesn't have it. */ + IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 && + (cio->io_child_type != ZIO_CHILD_VDEV), + (pio->io_pipeline & ZIO_STAGE_READY) == 0); + + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + zl->zl_parent = pio; + zl->zl_child = cio; + + ASSERT(list_is_empty(&cio->io_parent_list)); + list_insert_head(&cio->io_parent_list, zl); + + mutex_enter(&pio->io_lock); + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + uint64_t *countp = pio->io_children[cio->io_child_type]; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + countp[w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + + mutex_exit(&pio->io_lock); +} + static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { @@ -668,9 +749,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); - pio->io_child_count--; - cio->io_parent_count--; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); @@ -725,7 +803,10 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, /* * If we can tell the caller to execute this parent next, do - * so. Otherwise dispatch the parent zio as its own task. + * so. We do this if the parent's zio type matches the child's + * type, or if it's a zio_null() with no done callback, and so + * has no actual work to do. Otherwise dispatch the parent zio + * in its own taskq. 
* * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch @@ -744,7 +825,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ - if (next_to_executep != NULL && *next_to_executep == NULL) { + if (next_to_executep != NULL && *next_to_executep == NULL && + (pio->io_type == zio->io_type || + (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); @@ -804,7 +887,7 @@ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, vdev_t *vd, uint64_t offset, + zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { @@ -821,7 +904,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); - bzero(zio, sizeof (zio_t)); + memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); @@ -842,12 +925,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { - zio->io_bp = (blkptr_t *)bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || - zio->io_child_type == ZIO_CHILD_DDT) + zio->io_child_type == ZIO_CHILD_DDT) { + zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + } else { + zio->io_bp = (blkptr_t *)bp; + } + zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) @@ -869,8 +954,10 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; + zio->io_allocator = ZIO_ALLOCATOR_NONE; - zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) || + (pipeline & ZIO_STAGE_READY) == 0; zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) @@ -882,7 +969,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; - zio_add_child(pio, zio); + zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); @@ -890,7 +977,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, return (zio); } -static void +void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); @@ -901,9 +988,13 @@ zio_destroy(zio_t *zio) kmem_cache_free(zio_cache, zio); } +/* + * ZIO intended to be between others. Provides synchronization at READY + * and DONE pipeline stages and calls the respective callbacks. + */ zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, - void *private, enum zio_flag flags) + void *private, zio_flag_t flags) { zio_t *zio; @@ -914,10 +1005,22 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, return (zio); } +/* + * ZIO intended to be a root of a tree. 
Unlike null ZIO does not have a + * READY pipeline stage (is ready on creation), so it should not be used + * as child of any ZIO that may need waiting for grandchildren READY stage + * (any other ZIO type). + */ zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) +zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { - return (zio_null(NULL, spa, NULL, done, private, flags)); + zio_t *zio; + + zio = zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE); + + return (zio); } static int @@ -931,9 +1034,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); + zfs_dbgmsg("bad blkptr at %px: " + "DVA[0]=%#llx/%#llx " + "DVA[1]=%#llx/%#llx " + "DVA[2]=%#llx/%#llx " + "prop=%#llx " + "pad=%#llx,%#llx " + "phys_birth=%#llx " + "birth=%#llx " + "fill=%#llx " + "cksum=%#llx/%#llx/%#llx/%#llx", + bp, + (long long)bp->blk_dva[0].dva_word[0], + (long long)bp->blk_dva[0].dva_word[1], + (long long)bp->blk_dva[1].dva_word[0], + (long long)bp->blk_dva[1].dva_word[1], + (long long)bp->blk_dva[2].dva_word[0], + (long long)bp->blk_dva[2].dva_word[1], + (long long)bp->blk_prop, + (long long)bp->blk_pad[0], + (long long)bp->blk_pad[1], + (long long)BP_GET_PHYSICAL_BIRTH(bp), + (long long)BP_GET_LOGICAL_BIRTH(bp), + (long long)bp->blk_fill, + (long long)bp->blk_cksum.zc_word[0], + (long long)bp->blk_cksum.zc_word[1], + (long long)bp->blk_cksum.zc_word[2], + (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: @@ -954,49 +1083,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. 
* - * Modes for zfs_blkptr_verify: - * 1) BLK_VERIFY_ONLY (evaluate the block) - * 2) BLK_VERIFY_LOG (evaluate the block and log problems) - * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) + * Values for blk_verify_flag: + * BLK_VERIFY_ONLY: evaluate the block + * BLK_VERIFY_LOG: evaluate the block and log problems + * BLK_VERIFY_HALT: call zfs_panic_recover on error + * + * Values for blk_config_flag: + * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer + * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be + * obtained for reader + * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better + * performance */ boolean_t -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, - enum blk_verify_flag blk_verify) +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid TYPE %llu", + "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } - if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || - BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { + if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid CHECKSUM %llu", + "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } - if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || - BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { + if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid COMPRESS %llu", + "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid LSIZE %llu", + "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid PSIZE %llu", + "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid ETYPE %llu", + "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } @@ -1008,17 +1142,27 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (!spa->spa_trust_config) return (errors == 0); - if (!config_held) - spa_config_enter(spa, SCL_VDEV, bp, RW_READER); - else + switch (blk_config) { + case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); + break; + case BLK_CONFIG_NEEDED: + spa_config_enter(spa, SCL_VDEV, bp, RW_READER); + break; + case BLK_CONFIG_SKIP: + return (errors == 0); + default: + panic("invalid blk_config %u", blk_config); + } + /* * Pool-specific checks. * - * Note: it would be nice to verify that the blk_birth and - * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() - * allows the birth time of log blocks (and dmu_sync()-ed blocks - * that are in the log) to be arbitrarily large. + * Note: it would be nice to verify that the logical birth + * and physical birth are not too large. However, + * spa_freeze() allows the birth time of log blocks (and + * dmu_sync()-ed blocks that are in the log) to be arbitrarily + * large. 
*/ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; @@ -1026,20 +1170,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has hole VDEV %llu", + "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } @@ -1057,13 +1201,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid OFFSET %llu", + "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } - if (errors > 0) - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); - if (!config_held) + if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); @@ -1072,6 +1214,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { + (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) @@ -1102,11 +1245,11 @@ zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) + zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; - zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, + zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
@@ -1119,9 +1262,8 @@ zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, enum zio_flag flags, - const zbookmark_phys_t *zb) + zio_done_func_t *done, void *private, zio_priority_t priority, + zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -1141,7 +1283,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_ready = ready; zio->io_children_ready = children_ready; - zio->io_physdone = physdone; zio->io_prop = *zp; /* @@ -1163,7 +1304,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) + zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; @@ -1175,12 +1316,14 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, } void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, + boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed @@ -1189,6 +1332,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; + zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } @@ -1197,7 +1341,7 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { - (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. 
We @@ -1206,7 +1350,6 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) */ if (BP_IS_EMBEDDED(bp)) return; - metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be @@ -1222,7 +1365,9 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && - !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { + !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || + brt_maybe_exists(spa, bp)) { + metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); @@ -1236,7 +1381,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - enum zio_flag flags) + zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); @@ -1248,11 +1393,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, arc_freed(spa, bp); dsl_scan_freed(spa, bp); - if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) { + if (BP_IS_GANG(bp) || + BP_GET_DEDUP(bp) || + brt_maybe_exists(spa, bp)) { /* - * GANG and DEDUP blocks can induce a read (for the gang block - * header, or the DDT), so issue them asynchronously so that - * this thread is not tied up. + * GANG, DEDUP and BRT blocks can induce a read (for the gang + * block header, the DDT or the BRT), so issue them + * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; @@ -1269,12 +1416,12 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_done_func_t *done, void *private, enum zio_flag flags) + zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; - (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); @@ -1291,7 +1438,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
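The zio_free() hunk above adds brt_maybe_exists() to the set of conditions that push a free onto the per-txg bplist instead of freeing it synchronously. A compact stand-alone predicate that mirrors that condition, with purely illustrative parameter names:

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch (not ZFS code): a free is deferred when it may need extra work
 * (gang header, DDT or BRT reads), when it is not for the currently
 * syncing txg, or late in the sync passes without the log spacemap
 * feature active.
 */
static bool
illus_defer_free(bool is_gang, bool is_dedup, bool brt_may_exist,
    uint64_t txg, uint64_t syncing_txg, uint64_t sync_pass,
    uint64_t deferred_free_pass, bool log_spacemap_active)
{
	return (is_gang || is_dedup || brt_may_exist ||
	    txg != syncing_txg ||
	    (sync_pass >= deferred_free_pass && !log_spacemap_active));
}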
*/ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ @@ -1305,33 +1452,9 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, } zio_t * -zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, enum zio_flag flags) -{ - zio_t *zio; - int c; - - if (vd->vdev_children == 0) { - zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, - ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); - - zio->io_cmd = cmd; - } else { - zio = zio_null(pio, spa, NULL, NULL, NULL, flags); - - for (c = 0; c < vd->vdev_children; c++) - zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - done, private, flags)); - } - - return (zio); -} - -zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, - enum zio_flag flags, enum trim_flag trim_flags) + zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; @@ -1351,7 +1474,7 @@ zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) + zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; @@ -1372,7 +1495,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) + zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; @@ -1409,7 +1532,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; @@ -1468,22 +1591,17 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, flags &= ~ZIO_FLAG_IO_ALLOCATING; } - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - zio->io_physdone = pio->io_physdone; - if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) - zio->io_logical->io_phys_children++; - return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - zio_type_t type, zio_priority_t priority, enum zio_flag flags, + zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; @@ -1499,12 +1617,29 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, return (zio); } + +/* + * Send a flush command to the given vdev. Unlike most zio creation functions, + * the flush zios are issued immediately. You can wait on pio to pause until + * the flushes complete. 
+ */ void -zio_flush(zio_t *zio, vdev_t *vd) +zio_flush(zio_t *pio, vdev_t *vd) { - zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, - NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); + const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY; + + if (vd->vdev_nowritecache) + return; + + if (vd->vdev_children == 0) { + zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0, + NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flags, vd, 0, + NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE)); + } else { + for (uint64_t c = 0; c < vd->vdev_children; c++) + zio_flush(pio, vd->vdev_child[c]); + } } void @@ -1528,6 +1663,19 @@ zio_shrink(zio_t *zio, uint64_t size) } /* + * Round provided allocation size up to a value that can be allocated + * by at least some vdev(s) in the pool with minimum or no additional + * padding and without extra space usage on others + */ +static uint64_t +zio_roundup_alloc_size(spa_t *spa, uint64_t size) +{ + if (size > spa->spa_min_alloc) + return (roundup(size, spa->spa_gcd_alloc)); + return (spa->spa_min_alloc); +} + +/* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== @@ -1565,15 +1713,8 @@ zio_read_bp_init(zio_t *zio) abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } - if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; @@ -1592,12 +1733,16 @@ zio_write_bp_init(zio_t *zio) blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; - ASSERT(bp->blk_birth != zio->io_txg); - ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (zp->zp_brtwrite) + return (zio); + + ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); + if (BP_IS_EMBEDDED(bp)) return (zio); @@ -1649,7 +1794,7 @@ zio_write_compress(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; - int pass = 1; + uint32_t pass = 1; /* * If our children haven't all reached the ready stage, @@ -1676,7 +1821,7 @@ zio_write_compress(zio_t *zio) ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -1696,19 +1841,23 @@ zio_write_compress(zio_t *zio) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), - spa_max_replication(spa)) == BP_GET_NDVAS(bp)); + ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) || + MIN(zp->zp_copies, spa_max_replication(spa)) + == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. 
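zio_roundup_alloc_size() above rounds an allocation up using two pool-wide values; as I read the hunk, spa_min_alloc is the pool's minimum allocation size and spa_gcd_alloc the GCD of the vdevs' allocation sizes. A stand-alone mirror with made-up values:

#include <stdint.h>
#include <stdio.h>

static uint64_t
roundup_u64(uint64_t x, uint64_t align)
{
	return (((x + align - 1) / align) * align);
}

static uint64_t
illus_roundup_alloc_size(uint64_t size, uint64_t min_alloc,
    uint64_t gcd_alloc)
{
	if (size > min_alloc)
		return (roundup_u64(size, gcd_alloc));
	return (min_alloc);
}

int
main(void)
{
	/* Assume a pool whose vdevs all allocate in 4 KiB units. */
	printf("%llu\n", (unsigned long long)
	    illus_roundup_alloc_size(2000, 4096, 4096));	/* -> 4096 */
	printf("%llu\n", (unsigned long long)
	    illus_roundup_alloc_size(9000, 4096, 4096));	/* -> 12288 */
	return (0);
}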
*/ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { - void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, + void *cbuf = NULL; + psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize, zp->zp_complevel); - if (psize == 0 || psize >= lsize) { + if (psize == 0) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + } else if (psize >= lsize) { + compress = ZIO_COMPRESS_OFF; + if (cbuf != NULL) + zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && @@ -1719,7 +1868,7 @@ zio_write_compress(zio_t *zio) BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); - bp->blk_birth = zio->io_txg; + BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); @@ -1733,9 +1882,8 @@ zio_write_compress(zio_t *zio) * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)roundup(psize, - spa->spa_min_alloc); + size_t rounded = (size_t)zio_roundup_alloc_size(spa, + psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1771,9 +1919,15 @@ zio_write_compress(zio_t *zio) zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; - } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { - size_t rounded = MIN((size_t)roundup(psize, - spa->spa_min_alloc), lsize); + } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && + !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { + /* + * If we are raw receiving an encrypted dataset we should not + * take this codepath because it will change the on-disk block + * and decryption will fail. + */ + size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), + lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); @@ -1795,7 +1949,7 @@ zio_write_compress(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); @@ -1809,7 +1963,7 @@ zio_write_compress(zio_t *zio) } if (psize == 0) { - if (zio->io_bp_orig.blk_birth != 0 && + if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); @@ -1869,7 +2023,6 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; - int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and @@ -1887,23 +2040,18 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) /* * If this is a high priority I/O, then use the high priority taskq if - * available. + * available or cut the line otherwise. 
*/ - if ((zio->io_priority == ZIO_PRIORITY_NOW || - zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && - spa->spa_zio_taskq[t][q + 1].stqs_count != 0) - q++; + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) { + if (spa->spa_zio_taskq[t][q + 1].stqs_count != 0) + q++; + else + cutinline = B_TRUE; + } ASSERT3U(q, <, ZIO_TASKQ_TYPES); - /* - * NB: We are assuming that the zio can only be dispatched - * to a single taskq at a time. It would be a grievous error - * to dispatch the zio to another taskq at the same time. - */ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, - &zio->io_tqent); + spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline); } static boolean_t @@ -1928,8 +2076,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) static zio_t * zio_issue_async(zio_t *zio) { + ASSERT((zio->io_type != ZIO_TYPE_WRITE) || ZIO_HAS_ALLOCATOR(zio)); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (NULL); } @@ -2027,7 +2175,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " - "priority=%d flags=0x%x stage=0x%x " + "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " @@ -2037,8 +2185,8 @@ zio_deadman_impl(zio_t *pio, int ziodepth) (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, - pio->io_priority, pio->io_flags, pio->io_stage, - pio->io_pipeline, pio->io_pipeline_trace, + pio->io_priority, (u_longlong_t)pio->io_flags, + pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, @@ -2065,7 +2213,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) * using the zfs_dbgmsg() interface then post deadman event for the ZED. 
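The dispatch change above treats ZIO_PRIORITY_SYNC_WRITE specially: bump to the _HIGH taskq when one exists, otherwise cut to the front of the normal queue. A tiny stand-alone sketch of that branch (not ZFS code; names are illustrative):

#include <stdbool.h>

static void
illus_pick_queue(bool is_sync_write, bool high_queue_exists,
    int *queue_index, bool *cut_in_line)
{
	if (is_sync_write) {
		if (high_queue_exists)
			(*queue_index)++;	/* use the _HIGH taskq */
		else
			*cut_in_line = true;	/* front of the normal queue */
	}
}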
*/ void -zio_deadman(zio_t *pio, char *tag) +zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); @@ -2143,6 +2291,8 @@ zio_execute_stack_check(zio_t *zio) !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); +#else + (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); @@ -2241,6 +2391,9 @@ zio_wait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); mutex_enter(&zio->io_lock); @@ -2277,7 +2430,7 @@ zio_nowait(zio_t *zio) ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { + list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* @@ -2293,6 +2446,9 @@ zio_nowait(zio_t *zio) ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); + if (zio->io_type == ZIO_TYPE_WRITE) { + spa_select_allocator(zio); + } __zio_execute(zio); } @@ -2306,13 +2462,14 @@ static void zio_reexecute(void *arg) { zio_t *pio = arg; - zio_t *cio, *cio_next; + zio_t *cio, *cio_next, *gio; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); + mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; @@ -2320,8 +2477,16 @@ zio_reexecute(void *arg) pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_state[w] = 0; + pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || + (pio->io_pipeline & ZIO_STAGE_READY) == 0; + pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); + zio_link_t *zl = NULL; + while ((gio = zio_walk_parents(pio, &zl)) != NULL) { + for (int w = 0; w < ZIO_WAIT_TYPES; w++) { + gio->io_children[pio->io_child_type][w] += + !pio->io_state[w]; + } + } for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; @@ -2335,12 +2500,9 @@ zio_reexecute(void *arg) * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. 
*/ - zio_link_t *zl = NULL; - mutex_enter(&pio->io_lock); + zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); - for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); @@ -2366,8 +2528,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " - "failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " + "I/O failure and has been suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); @@ -2555,11 +2719,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, return (zio); } -/* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { + (void) gn, (void) data, (void) offset; + zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { @@ -2569,11 +2734,11 @@ zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, return (zio); } -/* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { + (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } @@ -2652,7 +2817,7 @@ zio_gang_tree_assemble_done(zio_t *zio) blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; @@ -2751,6 +2916,12 @@ zio_gang_issue(zio_t *zio) } static void +zio_gang_inherit_allocator(zio_t *pio, zio_t *cio) +{ + cio->io_allocator = pio->io_allocator; +} + +static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); @@ -2768,7 +2939,7 @@ zio_write_gang_member_ready(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { @@ -2806,19 +2977,22 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; - int gbh_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* - * encrypted blocks need DVA[2] free so encrypted gang headers can't - * have a third copy. + * If one copy was requested, store 2 copies of the GBH, so that we + * can still traverse all the data (e.g. to free or scrub) even if a + * block is damaged. Note that we can't store 3 copies of the GBH in + * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. 
*/ - gbh_copies = MIN(copies + 1, spa_max_replication(spa)); - if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) - gbh_copies = SPA_DVAS_PER_BP - 1; + int gbh_copies = copies; + if (gbh_copies == 1) { + gbh_copies = MIN(2, spa_max_replication(spa)); + } + ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2872,7 +3046,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; - bzero(gbh, SPA_GANGBLOCKSIZE); + memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* @@ -2882,6 +3056,8 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(pio, zio); + /* * Create and nowait the gang children. */ @@ -2901,17 +3077,19 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; - bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); - bzero(zp.zp_iv, ZIO_DATA_IV_LEN); - bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); + memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); + memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); + memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio_gang_inherit_allocator(zio, cio); + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); @@ -2932,11 +3110,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - /* - * We didn't allocate this bp, so make sure it doesn't get unmarked. - */ - pio->io_flags &= ~ZIO_FLAG_FASTWRITE; - zio_nowait(zio); return (pio); @@ -2967,6 +3140,7 @@ zio_nop_write(zio_t *zio) blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; + ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); @@ -3000,8 +3174,7 @@ zio_nop_write(zio_t *zio) ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); - ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, - sizeof (uint64_t)) == 0); + ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an @@ -3009,11 +3182,13 @@ zio_nop_write(zio_t *zio) * allow a new block to be allocated on a concrete vdev. 
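The gang-header policy above now derives gbh_copies from the requested copies: a single-copy write still stores two copies of the gang header (when replication allows), so the gang tree stays traversable after losing one header. A minimal stand-alone mirror of that rule:

#include <stdint.h>

static int
illus_gbh_copies(int requested_copies, int spa_max_replication)
{
	if (requested_copies == 1)
		return (spa_max_replication < 2 ? spa_max_replication : 2);
	return (requested_copies);
}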
*/ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); - vdev_t *tvd = vdev_lookup_top(zio->io_spa, - DVA_GET_VDEV(&bp->blk_dva[0])); - if (tvd->vdev_ops == &vdev_indirect_ops) { - spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); - return (zio); + for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { + vdev_t *tvd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(&bp_orig->blk_dva[d])); + if (tvd->vdev_ops == &vdev_indirect_ops) { + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + return (zio); + } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); @@ -3027,6 +3202,35 @@ zio_nop_write(zio_t *zio) /* * ========================================================================== + * Block Reference Table + * ========================================================================== + */ +static zio_t * +zio_brt_free(zio_t *zio) +{ + blkptr_t *bp; + + bp = zio->io_bp; + + if (BP_GET_LEVEL(bp) > 0 || + BP_IS_METADATA(bp) || + !brt_maybe_exists(zio->io_spa, bp)) { + return (zio); + } + + if (!brt_entry_decref(zio->io_spa, bp)) { + /* + * This isn't the last reference, so we cannot free + * the data yet. + */ + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } + + return (zio); +} + +/* + * ========================================================================== * Dedup * ========================================================================== */ @@ -3332,14 +3536,14 @@ zio_ddt_write(zio_t *zio) else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { - ASSERT(bp->blk_birth == txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, NULL, + zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); @@ -3354,7 +3558,7 @@ zio_ddt_write(zio_t *zio) return (zio); } -ddt_entry_t *freedde; /* for debugging */ +static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) @@ -3398,6 +3602,7 @@ zio_io_to_allocate(spa_t *spa, int allocator) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * Try to place a reservation for this zio. If we're unable to @@ -3434,21 +3639,12 @@ zio_dva_throttle(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - zbookmark_phys_t *bm = &zio->io_bookmark; - /* - * We want to try to use as many allocators as possible to help improve - * performance, but we also want logically adjacent IOs to be physically - * adjacent to improve sequential read performance. We chunk each object - * into 2^20 block regions, and then hash based on the objset, object, - * level, and region to accomplish both of these goals. - */ - int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, - bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; - zio->io_allocator = allocator; + int allocator = zio->io_allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); @@ -3493,7 +3689,6 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); - flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? 
METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) @@ -3523,6 +3718,7 @@ zio_dva_allocate(zio_t *zio) * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ + ASSERT(ZIO_HAS_ALLOCATOR(zio)); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -3611,11 +3807,13 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp)) - metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); + if (!BP_IS_HOLE(bp)) { + metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), + B_TRUE); + } if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { @@ -3653,7 +3851,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ - int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, @@ -3755,7 +3953,7 @@ zio_vdev_io_start(zio_t *zio) * Note: the code can handle other kinds of writes, * but we don't expect them. */ - if (zio->io_vd->vdev_removing) { + if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); @@ -3857,8 +4055,15 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) - return (zio); + if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { + /* + * "no-op" injections return success, but do no actual + * work. Just skip the remaining vdev stages. 
+ */ + zio_vdev_io_bypass(zio); + zio_interrupt(zio); + return (NULL); + } if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); @@ -3887,17 +4092,17 @@ zio_vdev_io_done(zio_t *zio) } ASSERT(zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FLUSH || + zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); + if (zio->io_type != ZIO_TYPE_FLUSH) + vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, @@ -3906,7 +4111,8 @@ zio_vdev_io_done(zio_t *zio) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); - if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { + if (zio->io_error && zio->io_type != ZIO_TYPE_FLUSH && + zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { @@ -3917,7 +4123,7 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); - if (unexpected_error) + if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); @@ -3964,7 +4170,6 @@ zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } -/*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { @@ -4009,8 +4214,7 @@ zio_vdev_io_assess(zio_t *zio) ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; - zio->io_flags |= ZIO_FLAG_IO_RETRY | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); @@ -4043,20 +4247,12 @@ zio_vdev_io_assess(zio_t *zio) * boolean flag so that we don't bother with it in the future. 
*/ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_IOCTL && - zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - zio->io_physdone != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); - ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); - zio->io_physdone(zio->io_logical); - } - return (zio); } @@ -4305,12 +4501,12 @@ zio_checksum_verify(zio_t *zio) zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - (void) zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, &zio->io_bookmark, zio, - zio->io_offset, zio->io_size, &info); mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); + (void) zfs_ereport_start_checksum(zio->io_spa, + zio->io_vd, &zio->io_bookmark, zio, + zio->io_offset, zio->io_size, &info); } } @@ -4364,22 +4560,24 @@ zio_ready(zio_t *zio) zio_t *pio, *pio_next; zio_link_t *zl = NULL; - if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, - ZIO_WAIT_READY)) { + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || - (zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || + BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } +#ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; +#endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -4388,6 +4586,7 @@ zio_ready(zio_t *zio) ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); /* * We were unable to allocate anything, unreserve and @@ -4418,7 +4617,7 @@ zio_ready(zio_t *zio) } if (zio->io_flags & ZIO_FLAG_NODATA) { - if (BP_IS_GANG(bp)) { + if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); @@ -4474,6 +4673,7 @@ zio_dva_throttle_done(zio_t *zio) } ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT(ZIO_HAS_ALLOCATOR(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); @@ -4536,6 +4736,7 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); + ASSERT(ZIO_HAS_ALLOCATOR(zio)); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); @@ -4551,7 +4752,7 @@ zio_done(zio_t *zio) if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); - ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, + ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && @@ -4662,7 +4863,8 @@ zio_done(zio_t *zio) * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. 
*/ - spa_log_error(zio->io_spa, &zio->io_bookmark); + spa_log_error(zio->io_spa, &zio->io_bookmark, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } @@ -4797,15 +4999,14 @@ zio_done(zio_t *zio) * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ - ASSERT(taskq_empty_ent(&zio->io_tqent)); - spa_taskq_dispatch_ent(zio->io_spa, + spa_taskq_dispatch(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, - zio_reexecute, zio, 0, &zio->io_tqent); + zio_reexecute, zio, B_FALSE); } return (NULL); } - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); @@ -4820,12 +5021,6 @@ zio_done(zio_t *zio) zfs_ereport_free_checksum(zcr); } - if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && - !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && - !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { - metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); - } - /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as @@ -4878,6 +5073,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_encrypt, zio_checksum_generate, zio_nop_write, + zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, @@ -4998,7 +5194,7 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp, { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; - ASSERT(last_block->zb_level == 0); + ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) @@ -5024,26 +5220,41 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp, last_block) <= 0); } +/* + * This function is similar to zbookmark_subtree_completed(), but returns true + * if subtree_root is equal or ahead of last_block, i.e. still to be done. 
+ */ +boolean_t +zbookmark_subtree_tbd(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + ASSERT0(last_block->zb_level); + if (dnp == NULL) + return (B_FALSE); + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, + last_block) >= 0); +} + EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); -ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, @@ -5051,4 +5262,3 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index e6b5c9588939..ce6772a40c8b 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -91,29 +91,29 @@ * invocation and passed to the checksum function. 
*/ -/*ARGSUSED*/ static void abd_checksum_off(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) abd, (void) size, (void) ctx_template; ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } -/*ARGSUSED*/ static void abd_fletcher_2_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_native, zcp); } -/*ARGSUSED*/ static void abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_byteswap, zcp); @@ -127,11 +127,11 @@ abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp) fletcher_4_abd_ops.acf_fini(acdp); } -/*ARGSUSED*/ void abd_fletcher_4_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { @@ -144,11 +144,11 @@ abd_fletcher_4_native(abd_t *abd, uint64_t size, } -/*ARGSUSED*/ void abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { + (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { @@ -165,10 +165,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "on"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, @@ -177,14 +177,14 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { NULL, NULL, 0, "fletcher2"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{abd_checksum_SHA256, abd_checksum_SHA256}, + {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, - {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, + {{abd_checksum_sha512_native, abd_checksum_sha512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, @@ -195,6 +195,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, + {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap}, + abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, }; /* @@ -207,6 +211,8 @@ zio_checksum_to_feature(enum zio_checksum cksum) VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); switch (cksum) { + case ZIO_CHECKSUM_BLAKE3: + return (SPA_FEATURE_BLAKE3); case ZIO_CHECKSUM_SHA512: return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: @@ -266,7 +272,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const 
blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_PHYSICAL_BIRTH(bp); + uint64_t txg = BP_GET_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); @@ -351,17 +357,20 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zio_eck_t eck; size_t eck_offset; - bzero(&saved, sizeof (zio_cksum_t)); + memset(&saved, 0, sizeof (zio_cksum_t)); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, - uint64_t); + uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused, + ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(size, >=, nused); + size = nused; eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck); } else { + ASSERT3U(size, >=, sizeof (zio_eck_t)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); @@ -417,6 +426,9 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, zio_checksum_template_init(checksum, spa); + IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED); + IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL); + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_cksum_t verifier; size_t eck_offset; @@ -439,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, return (SET_ERROR(ECKSUM)); } - if (nused > size) { + nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + if (size < nused) return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + size = nused; } else { + if (size < sizeof (zio_eck_t)) + return (SET_ERROR(ECKSUM)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); @@ -506,8 +519,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, } if (info != NULL) { - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c index 1ff1e76d7f22..c8a10db7483b 100644 --- a/sys/contrib/openzfs/module/zfs/zio_compress.c +++ b/sys/contrib/openzfs/module/zfs/zio_compress.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -44,7 +44,7 @@ * If nonzero, every 1/X decompression attempts will fail, simulating * an undetected memory error. */ -unsigned long zio_decompress_fail_fraction = 0; +static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. 
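The zio_checksum_compute() and zio_checksum_error_impl() hunks above tighten the embedded-checksum path: a ZILOG2 block's zc_nused is rounded up to ZIL_MIN_BLKSZ and must fit in the supplied buffer, and any other embedded-checksum block must be large enough to hold the trailing zio_eck_t. Below is a minimal user-space sketch of that size validation; the constants, the P2ROUNDUP macro, and the validate_embedded() helper are simplified stand-ins for illustration, not the kernel interfaces.

/*
 * Sketch of the embedded-checksum size validation (simplified).
 */
#include <stdio.h>
#include <stdint.h>

#define	ZIL_MIN_BLKSZ	4096ULL		/* stand-in value */
#define	ECK_SIZE	40ULL		/* stand-in for sizeof (zio_eck_t) */

/* Round x up to a multiple of align (align must be a power of two). */
#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)

static int
validate_embedded(uint64_t size, uint64_t nused, int is_zilog2)
{
	if (is_zilog2) {
		/* The used portion, rounded up, must fit in the buffer. */
		nused = P2ROUNDUP(nused, ZIL_MIN_BLKSZ);
		if (size < nused)
			return (-1);	/* ECKSUM in the kernel code */
	} else if (size < ECK_SIZE) {
		/* Too small to hold the embedded checksum trailer at all. */
		return (-1);
	}
	return (0);
}

int
main(void)
{
	printf("8K buffer, 5000 bytes used: %d\n",
	    validate_embedded(8192, 5000, 1));
	printf("4K buffer, 5000 bytes used: %d\n",
	    validate_embedded(4096, 5000, 1));
	printf("16-byte non-ZIL buffer:     %d\n",
	    validate_embedded(16, 0, 0));
	return (0);
}

Compiled with any C compiler, the sketch accepts the first case and rejects the other two, mirroring the ECKSUM returns added in the diff.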
@@ -66,7 +66,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, {"zle", 64, zle_compress, zle_decompress, NULL}, {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, - {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap, zfs_zstd_decompress, zfs_zstd_decompress_level}, }; @@ -74,6 +74,7 @@ uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, uint8_t parent) { + (void) spa; uint8_t result; if (!ZIO_COMPRESS_HASLEVEL(compress)) @@ -110,10 +111,11 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } -/*ARGSUSED*/ static int zio_compress_zeroed_cb(void *data, size_t len, void *private) { + (void) private; + uint64_t *end = (uint64_t *)((char *)data + len); for (uint64_t *word = (uint64_t *)data; word < end; word++) if (*word != 0) @@ -123,7 +125,7 @@ zio_compress_zeroed_cb(void *data, size_t len, void *private) } size_t -zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, +zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, uint8_t level) { size_t c_len, d_len; @@ -161,9 +163,12 @@ zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); } + if (*dst == NULL) + *dst = zio_buf_alloc(s_len); + /* No compression algorithms can read from ABDs directly */ void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel); + c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel); abd_return_buf(src, tmp, s_len); if (c_len > d_len) diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c index feaf41dc65e3..012a0e3c6c17 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0; typedef struct inject_handler { int zi_id; spa_t *zi_spa; + char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */ zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; @@ -148,7 +150,8 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && - (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && + (record->zi_dvas == 0 || + (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) && error == record->zi_error) { return (freq_triggered(record->zi_freq)); } @@ -161,7 +164,7 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, * specified by tag. 
*/ void -zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) +zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type) { inject_handler_t *handler; @@ -341,15 +344,14 @@ zio_handle_label_injection(zio_t *zio, int error) return (ret); } -/*ARGSUSED*/ static int zio_inject_bitflip_cb(void *data, size_t len, void *private) { - zio_t *zio __maybe_unused = private; + zio_t *zio = private; uint8_t *buffer = data; uint_t byte = random_in_range(len); - ASSERT(zio->io_type == ZIO_TYPE_READ); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); /* flip a single random bit in an abd data buffer */ buffer[byte] ^= 1 << random_in_range(8); @@ -364,10 +366,10 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) int ret = 0; /* - * We skip over faults in the labels unless it's during - * device open (i.e. zio == NULL). + * We skip over faults in the labels unless it's during device open + * (i.e. zio == NULL) or a device flush (offset is meaningless) */ - if (zio != NULL) { + if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || @@ -605,6 +607,12 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + /* also match on I/O type (e.g., -T read) */ + if (handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) { + continue; + } + /* * Defensive; should never happen as the array allocation * occurs prior to inserting this handler on the list. @@ -699,6 +707,63 @@ zio_handle_io_delay(zio_t *zio) return (min_target); } +static void +zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) +{ + inject_handler_t *handler; + hrtime_t delay = 0; + int id = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); + handler != NULL && handler->zi_record.zi_cmd == command; + handler = list_next(&inject_handlers, handler)) { + ASSERT3P(handler->zi_spa_name, !=, NULL); + if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { + uint64_t pause = + SEC2NSEC(handler->zi_record.zi_duration); + if (pause > elapsed) { + delay = pause - elapsed; + } + id = handler->zi_id; + break; + } + } + + rw_exit(&inject_lock); + + if (delay) { + if (command == ZINJECT_DELAY_IMPORT) { + spa_import_progress_set_notes(spa, "injecting %llu " + "sec delay", (u_longlong_t)NSEC2SEC(delay)); + } + zfs_sleep_until(gethrtime() + delay); + } + if (id) { + /* all done with this one-shot handler */ + zio_clear_fault(id); + } +} + +/* + * For testing, inject a delay during an import + */ +void +zio_handle_import_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT); +} + +/* + * For testing, inject a delay during an export + */ +void +zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) +{ + zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); +} + static int zio_calculate_range(const char *pool, zinject_record_t *record) { @@ -756,6 +821,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record) return (0); } +static boolean_t +zio_pool_handler_exists(const char *name, zinject_type_t command) +{ + boolean_t exists = B_FALSE; + + rw_enter(&inject_lock, RW_READER); + for (inject_handler_t *handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { + if (command != handler->zi_record.zi_cmd) + continue; + + const char *pool = (handler->zi_spa_name != NULL) ? 
+ handler->zi_spa_name : spa_name(handler->zi_spa); + if (strcmp(name, pool) == 0) { + exists = B_TRUE; + break; + } + } + rw_exit(&inject_lock); + + return (exists); +} /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, @@ -806,16 +893,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) if (!(flags & ZINJECT_NULL)) { /* - * spa_inject_ref() will add an injection reference, which will - * prevent the pool from being removed from the namespace while - * still allowing it to be unloaded. + * Pool delays for import or export don't take an + * injection reference on the spa. Instead they + * rely on matching by name. */ - if ((spa = spa_inject_addref(name)) == NULL) - return (SET_ERROR(ENOENT)); + if (record->zi_cmd == ZINJECT_DELAY_IMPORT || + record->zi_cmd == ZINJECT_DELAY_EXPORT) { + if (record->zi_duration <= 0) + return (SET_ERROR(EINVAL)); + /* + * Only one import | export delay handler per pool. + */ + if (zio_pool_handler_exists(name, record->zi_cmd)) + return (SET_ERROR(EEXIST)); + + mutex_enter(&spa_namespace_lock); + boolean_t has_spa = spa_lookup(name) != NULL; + mutex_exit(&spa_namespace_lock); + + if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) + return (SET_ERROR(EEXIST)); + if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa) + return (SET_ERROR(ENOENT)); + spa = NULL; + } else { + /* + * spa_inject_ref() will add an injection reference, + * which will prevent the pool from being removed + * from the namespace while still allowing it to be + * unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + } handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); - - handler->zi_spa = spa; + handler->zi_spa = spa; /* note: can be NULL */ handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { @@ -828,6 +941,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) handler->zi_next_lane = 0; } + if (handler->zi_spa == NULL) + handler->zi_spa_name = spa_strdup(name); + else + handler->zi_spa_name = NULL; + rw_enter(&inject_lock, RW_WRITER); /* @@ -887,7 +1005,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen, if (handler) { *record = handler->zi_record; *id = handler->zi_id; - (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ASSERT(handler->zi_spa || handler->zi_spa_name); + if (handler->zi_spa != NULL) + (void) strlcpy(name, spa_name(handler->zi_spa), buflen); + else + (void) strlcpy(name, handler->zi_spa_name, buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); @@ -937,7 +1059,11 @@ zio_clear_fault(int id) ASSERT3P(handler->zi_lanes, ==, NULL); } - spa_inject_delref(handler->zi_spa); + if (handler->zi_spa_name != NULL) + spa_strfree(handler->zi_spa_name); + + if (handler->zi_spa != NULL) + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); diff --git a/sys/contrib/openzfs/module/zfs/zle.c b/sys/contrib/openzfs/module/zfs/zle.c index 0decebb13ca7..1483a65af803 100644 --- a/sys/contrib/openzfs/module/zfs/zle.c +++ b/sys/contrib/openzfs/module/zfs/zle.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/zfs/zrlock.c b/sys/contrib/openzfs/module/zfs/zrlock.c index a4def6053622..0d50cc4712ca 100644 --- a/sys/contrib/openzfs/module/zfs/zrlock.c +++ b/sys/contrib/openzfs/module/zfs/zrlock.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -106,16 +106,16 @@ zrl_add_impl(zrlock_t *zrl, const char *zc) void zrl_remove(zrlock_t *zrl) { - uint32_t n; - #ifdef ZFS_DEBUG if (zrl->zr_owner == curthread) { zrl->zr_owner = NULL; zrl->zr_caller = NULL; } + int32_t n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); + ASSERT3S(n, >=, 0); +#else + atomic_dec_32((uint32_t *)&zrl->zr_refcount); #endif - n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); - ASSERT3S((int32_t)n, >=, 0); } int diff --git a/sys/contrib/openzfs/module/zfs/zthr.c b/sys/contrib/openzfs/module/zfs/zthr.c index 33fdda7b68d1..02b9f0805dd7 100644 --- a/sys/contrib/openzfs/module/zfs/zthr.c +++ b/sys/contrib/openzfs/module/zfs/zthr.c @@ -231,7 +231,7 @@ struct zthr { const char *zthr_name; }; -static void +static __attribute__((noreturn)) void zthr_procedure(void *arg) { zthr_t *t = arg; @@ -469,6 +469,12 @@ zthr_iscancelled(zthr_t *t) return (cancelled); } +boolean_t +zthr_iscurthread(zthr_t *t) +{ + return (t->zthr_thread == curthread); +} + /* * Wait for the zthr to finish its current function. Similar to * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index d50cce7d7357..5b6a3f5cb410 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -90,9 +90,8 @@ unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; -list_t zvol_state_list; +static list_t zvol_state_list; krwlock_t zvol_state_lock; -const zvol_platform_ops_t *ops; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, @@ -112,13 +111,10 @@ typedef struct { uint64_t zvol_name_hash(const char *name) { - int i; uint64_t crc = -1ULL; - const uint8_t *p = (const uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { + for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; - } return (crc); } @@ -139,8 +135,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); - if (zv->zv_hash == hash && - strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { + if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) { /* * this is the right zvol, take the locks in the * right order @@ -155,8 +150,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && - strncmp(zv->zv_name, name, MAXNAMELEN) - == 0); + strcmp(zv->zv_name, name) == 0); } rw_exit(&zvol_state_lock); return (zv); @@ -365,12 +359,46 @@ out: mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) - ops->zv_update_volsize(zv, volsize); + zvol_os_update_volsize(zv, volsize); return (SET_ERROR(error)); } /* + * Update volthreading. + */ +int +zvol_set_volthreading(const char *name, boolean_t value) +{ + zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); + if (zv == NULL) + return (ENOENT); + zv->zv_threading = value; + mutex_exit(&zv->zv_state_lock); + return (0); +} + +/* + * Update zvol ro property. + */ +int +zvol_set_ro(const char *name, boolean_t value) +{ + zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); + if (zv == NULL) + return (-1); + if (value) { + zvol_os_set_disk_ro(zv, 1); + zv->zv_flags |= ZVOL_RDONLY; + } else { + zvol_os_set_disk_ro(zv, 0); + zv->zv_flags &= ~ZVOL_RDONLY; + } + mutex_exit(&zv->zv_state_lock); + return (0); +} + +/* * Sanity check volume block size. 
*/ int @@ -418,6 +446,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) lr_truncate_t *lr = arg2; uint64_t offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -430,7 +460,7 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) if (error != 0) { dmu_tx_abort(tx); } else { - zil_replaying(zv->zv_zilog, tx); + (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); @@ -454,6 +484,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_t *tx; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -476,7 +508,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); - zil_replaying(zv->zv_zilog, tx); + (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } @@ -486,6 +518,7 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { + (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } @@ -493,7 +526,7 @@ zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ -zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { +zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ @@ -513,6 +546,10 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ + zvol_replay_err, /* TX_SETSAXATTR */ + zvol_replay_err, /* TX_RENAME_EXCHANGE */ + zvol_replay_err, /* TX_RENAME_WHITEOUT */ + zvol_replay_err, /* TX_CLONE_RANGE */ }; /* @@ -521,11 +558,11 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ -ssize_t zvol_immediate_write_sz = 32768; +static const ssize_t zvol_immediate_write_sz = 32768; void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, - uint64_t size, int sync) + uint64_t size, boolean_t commit) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; @@ -540,7 +577,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; - else if (sync) + else if (commit) write_state = WR_COPIED; else write_state = WR_NEED_COPY; @@ -575,7 +612,6 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; - itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); @@ -592,8 +628,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. 
*/ void -zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, - boolean_t sync) +zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; @@ -608,15 +643,14 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, lr->lr_offset = off; lr->lr_length = len; - itx->itx_sync = sync; zil_itx_assign(zilog, itx, tx); } -/* ARGSUSED */ static void zvol_get_done(zgd_t *zgd, int error) { + (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); @@ -640,10 +674,9 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, int error; ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* @@ -659,6 +692,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ + ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change @@ -669,8 +703,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); - error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, - DMU_READ_NO_PREFETCH); + error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, + &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -746,15 +780,15 @@ zvol_setup_zv(zvol_state_t *zv) if (error) return (SET_ERROR(error)); - ops->zv_set_capacity(zv, volsize >> 9); + zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { - ops->zv_set_disk_ro(zv, 1); + zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { - ops->zv_set_disk_ro(zv, 0); + zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); @@ -867,54 +901,26 @@ int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; - int error, locked = 0; - boolean_t ro; + int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(mutex_owned(&spa_namespace_lock)); - /* - * In all other cases the spa_namespace_lock is taken before the - * bdev->bd_mutex lock. But in this case the Linux __blkdev_get() - * function calls fops->open() with the bdev->bd_mutex lock held. - * This deadlock can be easily observed with zvols used as vdevs. - * - * To avoid a potential lock inversion deadlock we preemptively - * try to take the spa_namespace_lock(). Normally it will not - * be contended and this is safe because spa_open_common() handles - * the case where the caller already holds the spa_namespace_lock. - * - * When it is contended we risk a lock inversion if we were to - * block waiting for the lock. Luckily, the __blkdev_get() - * function allows us to return -ERESTARTSYS which will result in - * bdev->bd_mutex being dropped, reacquired, and fops->open() being - * called again. This process can be repeated safely until both - * locks are acquired. 
- */ - if (!mutex_owned(&spa_namespace_lock)) { - locked = mutex_tryenter(&spa_namespace_lock); - if (!locked) - return (SET_ERROR(EINTR)); - } - - ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); + boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) - goto out_mutex; + return (SET_ERROR(error)); zv->zv_objset = os; error = zvol_setup_zv(zv); - if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } -out_mutex: - if (locked) - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(error)); + return (error); } void @@ -951,7 +957,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } @@ -1053,8 +1059,7 @@ zvol_add_clones(const char *dsname, list_t *minors_list) out: if (dd != NULL) dsl_dir_rele(dd, FTAG); - if (dp != NULL) - dsl_pool_rele(dp, FTAG); + dsl_pool_rele(dp, FTAG); } /* @@ -1102,7 +1107,7 @@ zvol_create_minors_cb(const char *dsname, void *arg) * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ - error = dmu_objset_find(dsname, + (void) dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } @@ -1146,7 +1151,7 @@ zvol_create_minors_recursive(const char *name) * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * - * We will use this list to do zvol_create_minor_impl after prefetch + * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), @@ -1160,7 +1165,7 @@ zvol_create_minors_recursive(const char *name) &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, @@ -1171,13 +1176,12 @@ zvol_create_minors_recursive(const char *name) taskq_wait_outstanding(system_taskq, 0); /* - * Prefetch is completed, we can do zvol_create_minor_impl + * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ - while ((job = list_head(&minors_list)) != NULL) { - list_remove(&minors_list, job); + while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) - (void) ops->zv_create_minor(job->name); + (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } @@ -1207,9 +1211,9 @@ zvol_create_minor(const char *name) "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); } else { - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); } } @@ -1220,7 +1224,7 @@ zvol_create_minor(const char *name) static void zvol_free_task(void *arg) { - ops->zv_free(arg); + zvol_os_free(arg); } void @@ -1265,7 +1269,7 @@ zvol_remove_minors_impl(const char *name) * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. 
*/ - ops->zv_clear_private(zv); + zvol_os_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); @@ -1282,10 +1286,8 @@ zvol_remove_minors_impl(const char *name) rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ - while ((zv = list_head(&free_list)) != NULL) { - list_remove(&free_list, zv); - ops->zv_free(zv); - } + while ((zv = list_remove_head(&free_list)) != NULL) + zvol_os_free(zv); } /* Remove minor for this specific volume only */ @@ -1317,7 +1319,7 @@ zvol_remove_minor_impl(const char *name) } zvol_remove(zv); - ops->zv_clear_private(zv); + zvol_os_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { @@ -1329,7 +1331,7 @@ zvol_remove_minor_impl(const char *name) rw_exit(&zvol_state_lock); if (zv != NULL) - ops->zv_free(zv); + zvol_os_free(zv); } /* @@ -1339,13 +1341,12 @@ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; - int oldnamelen, newnamelen; + int oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); - newnamelen = strlen(newname); rw_enter(&zvol_state_lock, RW_READER); @@ -1355,14 +1356,14 @@ zvol_rename_minors_impl(const char *oldname, const char *newname) mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { - ops->zv_rename_minor(zv, newname); + zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); - ops->zv_rename_minor(zv, name); + zvol_os_rename_minor(zv, name); kmem_strfree(name); } @@ -1386,7 +1387,7 @@ zvol_set_snapdev_cb(const char *dsname, void *param) switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: - (void) ops->zv_create_minor(dsname); + (void) zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); @@ -1443,14 +1444,14 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ - (void) ops->zv_create_minor(name); + (void) zvol_os_create_minor(name); break; } spl_fstrans_unmark(cookie); @@ -1470,9 +1471,9 @@ zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, task->op = op; task->value = value; - strlcpy(task->name1, name1, MAXNAMELEN); + strlcpy(task->name1, name1, sizeof (task->name1)); if (name2 != NULL) - strlcpy(task->name2, name2, MAXNAMELEN); + strlcpy(task->name2, name2, sizeof (task->name2)); return (task); } @@ -1516,7 +1517,7 @@ typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; - dmu_tx_t *zsda_tx; + zfs_prop_t zsda_prop; } zvol_set_prop_int_arg_t; /* @@ -1524,7 +1525,7 @@ typedef struct zvol_set_prop_int_arg { * conditions are imposed. 
*/ static int -zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) +zvol_set_common_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); @@ -1540,104 +1541,34 @@ zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) return (error); } -/* ARGSUSED */ static int -zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { - char dsname[MAXNAMELEN]; + zvol_set_prop_int_arg_t *zsda = arg; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; zvol_task_t *task; - uint64_t snapdev; + uint64_t prop; + const char *prop_name = zfs_prop_to_name(zsda->zsda_prop); dsl_dataset_name(ds, dsname); - if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0) - return (0); - task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev); - if (task == NULL) - return (0); - - (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, - task, TQ_SLEEP); - return (0); -} - -/* - * Traverse all child datasets and apply snapdev appropriately. - * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel - * dataset and read the effective "snapdev" on every child in the callback - * function: this is because the value is not guaranteed to be the same in the - * whole dataset hierarchy. - */ -static void -zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) -{ - zvol_set_prop_int_arg_t *zsda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd; - dsl_dataset_t *ds; - int error; - VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); - zsda->zsda_tx = tx; + if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) + return (0); - error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); - if (error == 0) { - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), - zsda->zsda_source, sizeof (zsda->zsda_value), 1, - &zsda->zsda_value, zsda->zsda_tx); - dsl_dataset_rele(ds, FTAG); + switch (zsda->zsda_prop) { + case ZFS_PROP_VOLMODE: + task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, + NULL, prop); + break; + case ZFS_PROP_SNAPDEV: + task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, + NULL, prop); + break; + default: + task = NULL; + break; } - dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, - zsda, DS_FIND_CHILDREN); - dsl_dir_rele(dd, FTAG); -} - -int -zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) -{ - zvol_set_prop_int_arg_t zsda; - - zsda.zsda_name = ddname; - zsda.zsda_source = source; - zsda.zsda_value = snapdev; - - return (dsl_sync_task(ddname, zvol_set_snapdev_check, - zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); -} - -/* - * Sanity check the dataset for safe use by the sync task. No additional - * conditions are imposed. 
- */ -static int -zvol_set_volmode_check(void *arg, dmu_tx_t *tx) -{ - zvol_set_prop_int_arg_t *zsda = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dir_t *dd; - int error; - - error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); - if (error != 0) - return (error); - - dsl_dir_rele(dd, FTAG); - - return (error); -} - -/* ARGSUSED */ -static int -zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) -{ - char dsname[MAXNAMELEN]; - zvol_task_t *task; - uint64_t volmode; - - dsl_dataset_name(ds, dsname); - if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0) - return (0); - task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode); if (task == NULL) return (0); @@ -1647,14 +1578,14 @@ zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) } /* - * Traverse all child datasets and apply volmode appropriately. + * Traverse all child datasets and apply the property appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel - * dataset and read the effective "volmode" on every child in the callback + * dataset and read the effective "property" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void -zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) +zvol_set_common_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); @@ -1663,33 +1594,34 @@ zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); - zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE), + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop), zsda->zsda_source, sizeof (zsda->zsda_value), 1, - &zsda->zsda_value, zsda->zsda_tx); + &zsda->zsda_value, tx); dsl_dataset_rele(ds, FTAG); } - dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb, + dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int -zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) +zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, + uint64_t val) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; - zsda.zsda_value = volmode; + zsda.zsda_value = val; + zsda.zsda_prop = prop; - return (dsl_sync_task(ddname, zvol_set_volmode_check, - zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); + return (dsl_sync_task(ddname, zvol_set_common_check, + zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void @@ -1727,13 +1659,7 @@ boolean_t zvol_is_zvol(const char *name) { - return (ops->zv_is_zvol(name)); -} - -void -zvol_register_ops(const zvol_platform_ops_t *zvol_ops) -{ - ops = zvol_ops; + return (zvol_os_is_zvol(name)); } int |
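The final zvol.c hunks above fold zvol_set_snapdev() and zvol_set_volmode() into a single zvol_set_common() entry point: the sync-task argument now carries a zfs_prop_t, and the per-dataset callback picks the matching async task from it. A minimal sketch of that dispatch-by-property pattern follows; the enum, struct, and apply_prop() helper are hypothetical stand-ins used only to illustrate the shape of the change.

/*
 * Sketch of dispatching a per-dataset action from a property id.
 */
#include <stdio.h>
#include <stdint.h>

typedef enum { PROP_SNAPDEV, PROP_VOLMODE } vol_prop_t;

typedef struct {
	vol_prop_t	prop;	/* which property is being set */
	uint64_t	value;	/* effective value for this dataset */
} vol_prop_arg_t;

/* Per-dataset callback: one switch selects the task to dispatch. */
static void
apply_prop(const char *dsname, const vol_prop_arg_t *arg)
{
	switch (arg->prop) {
	case PROP_SNAPDEV:
		printf("%s: dispatch SET_SNAPDEV(%llu)\n", dsname,
		    (unsigned long long)arg->value);
		break;
	case PROP_VOLMODE:
		printf("%s: dispatch SET_VOLMODE(%llu)\n", dsname,
		    (unsigned long long)arg->value);
		break;
	default:
		break;	/* unknown property: nothing to dispatch */
	}
}

int
main(void)
{
	vol_prop_arg_t snapdev = { PROP_SNAPDEV, 1 };
	vol_prop_arg_t volmode = { PROP_VOLMODE, 2 };

	apply_prop("pool/vol", &snapdev);
	apply_prop("pool/vol@snap", &snapdev);
	apply_prop("pool/vol", &volmode);
	return (0);
}

One entry point and one callback replace the two near-duplicate check/sync/callback triples removed in the diff, which is the point of the consolidation.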