author     Martin Matuska <mm@FreeBSD.org>    2022-05-18 22:54:40 +0000
committer  Martin Matuska <mm@FreeBSD.org>    2022-05-18 22:55:59 +0000
commit     716fd348e01c5f2ba125f878a634a753436c2994 (patch)
tree       0d738baf7a9ccfd90fa1e622f67e0399f306f024 /sys/contrib/openzfs/module/zfs
parent     4e2d3f26bd12610ef8672eefb02814b882a4c29b (diff)
parent     c0cf6ed6792e545fd614c2a88cb53756db7e03f8 (diff)
zfs: merge openzfs/zfs@c0cf6ed67
Notable upstream pull request merges:
#10662 zvol_wait: Ignore locked zvols
#12789 Improve log spacemap load time
#12812 Improved zpool status output, list all affected datasets
#13277 FreeBSD: Use NDFREE_PNBUF if available
#13302 Make zfs_max_recordsize default to 16M
#13311 Fix error handling in FreeBSD's get/putpages VOPs
#13345 FreeBSD: Fix translation from ABD to physical pages
#13373 zfs: holds: dequadratify
#13375 Corrected edge case in uncompressed ARC->L2ARC handling
#13388 Improve mg_aliquot math
#13405 Reduce dbuf_find() lock contention
#13406 FreeBSD: use zero_region instead of allocating a dedicated page
Obtained from: OpenZFS
OpenZFS commit: c0cf6ed6792e545fd614c2a88cb53756db7e03f8
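Among the merges above, #13405 ("Reduce dbuf_find() lock contention") converts the dbuf hash table's per-bucket mutexes into reader/writer locks, so concurrent lookups that hash to the same bucket no longer serialize (see the dbuf.c hunks below). Here is a minimal userland sketch of that pattern, using POSIX rwlocks in place of the kernel's rw_enter()/rw_exit(); the names (bucket_t, ht_find, ht_insert) are illustrative, not OpenZFS API:

```c
/*
 * Sketch of the hash-bucket locking pattern behind #13405.
 * pthread rwlocks stand in for the SPL's rw_enter()/rw_exit().
 */
#include <pthread.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct entry {
	uint64_t	e_key;
	uint64_t	e_val;
	struct entry	*e_next;
} entry_t;

typedef struct bucket {
	pthread_rwlock_t b_lock;	/* was a mutex; now reader/writer */
	entry_t		*b_head;
} bucket_t;

/*
 * Lookups take the bucket lock as reader, so lookups of entries that
 * hash to the same bucket can proceed in parallel.
 */
bool
ht_find(bucket_t *b, uint64_t key, uint64_t *valp)
{
	bool found = false;

	(void) pthread_rwlock_rdlock(&b->b_lock);
	for (entry_t *e = b->b_head; e != NULL; e = e->e_next) {
		if (e->e_key == key) {
			*valp = e->e_val;
			found = true;
			break;
		}
	}
	(void) pthread_rwlock_unlock(&b->b_lock);
	return (found);
}

/* Inserts still modify the chain, so they take the lock as writer. */
void
ht_insert(bucket_t *b, entry_t *e)
{
	(void) pthread_rwlock_wrlock(&b->b_lock);
	e->e_next = b->b_head;
	b->b_head = e;
	(void) pthread_rwlock_unlock(&b->b_lock);
}

int
main(void)
{
	bucket_t b;
	entry_t e = { .e_key = 42, .e_val = 7, .e_next = NULL };
	uint64_t v;

	(void) pthread_rwlock_init(&b.b_lock, NULL);
	b.b_head = NULL;
	ht_insert(&b, &e);
	return (ht_find(&b, 42, &v) && v == 7 ? 0 : 1);
}
```

Unlike this sketch, dbuf_find() additionally takes the per-dbuf db_mtx before dropping the bucket lock, so a returned dbuf cannot be evicted out from under the caller; the pthread_rwlock_init() call mirrors the rw_init() loop the change adds to dbuf_init().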
Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
19 files changed, 1202 insertions, 432 deletions
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in deleted file mode 100644 index 30dc91a7eb59..000000000000 --- a/sys/contrib/openzfs/module/zfs/Makefile.in +++ /dev/null @@ -1,158 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -mfdir = $(obj) -else -mfdir = $(srctree)/$(src) -endif - -MODULE := zfs - -obj-$(CONFIG_ZFS) := $(MODULE).o - -# Suppress unused-value warnings in sparc64 architecture headers -ccflags-$(CONFIG_SPARC64) += -Wno-unused-value - -$(MODULE)-objs += abd.o -$(MODULE)-objs += aggsum.o -$(MODULE)-objs += arc.o -$(MODULE)-objs += blkptr.o -$(MODULE)-objs += bplist.o -$(MODULE)-objs += bpobj.o -$(MODULE)-objs += bptree.o -$(MODULE)-objs += btree.o -$(MODULE)-objs += bqueue.o -$(MODULE)-objs += dataset_kstats.o -$(MODULE)-objs += dbuf.o -$(MODULE)-objs += dbuf_stats.o -$(MODULE)-objs += ddt.o -$(MODULE)-objs += ddt_zap.o -$(MODULE)-objs += dmu.o -$(MODULE)-objs += dmu_diff.o -$(MODULE)-objs += dmu_object.o -$(MODULE)-objs += dmu_objset.o -$(MODULE)-objs += dmu_recv.o -$(MODULE)-objs += dmu_redact.o -$(MODULE)-objs += dmu_send.o -$(MODULE)-objs += dmu_traverse.o -$(MODULE)-objs += dmu_tx.o -$(MODULE)-objs += dmu_zfetch.o -$(MODULE)-objs += dnode.o -$(MODULE)-objs += dnode_sync.o -$(MODULE)-objs += dsl_bookmark.o -$(MODULE)-objs += dsl_crypt.o -$(MODULE)-objs += dsl_dataset.o -$(MODULE)-objs += dsl_deadlist.o -$(MODULE)-objs += dsl_deleg.o -$(MODULE)-objs += dsl_destroy.o -$(MODULE)-objs += dsl_dir.o -$(MODULE)-objs += dsl_pool.o -$(MODULE)-objs += dsl_prop.o -$(MODULE)-objs += dsl_scan.o -$(MODULE)-objs += dsl_synctask.o -$(MODULE)-objs += dsl_userhold.o -$(MODULE)-objs += edonr_zfs.o -$(MODULE)-objs += fm.o -$(MODULE)-objs += gzip.o -$(MODULE)-objs += hkdf.o -$(MODULE)-objs += lz4.o -$(MODULE)-objs += lz4_zfs.o -$(MODULE)-objs += lzjb.o -$(MODULE)-objs += metaslab.o -$(MODULE)-objs += mmp.o -$(MODULE)-objs += multilist.o -$(MODULE)-objs += objlist.o -$(MODULE)-objs += pathname.o -$(MODULE)-objs += range_tree.o -$(MODULE)-objs += refcount.o -$(MODULE)-objs += rrwlock.o -$(MODULE)-objs += sa.o -$(MODULE)-objs += sha256.o -$(MODULE)-objs += skein_zfs.o -$(MODULE)-objs += spa.o -$(MODULE)-objs += spa_boot.o -$(MODULE)-objs += spa_checkpoint.o -$(MODULE)-objs += spa_config.o -$(MODULE)-objs += spa_errlog.o -$(MODULE)-objs += spa_history.o -$(MODULE)-objs += spa_log_spacemap.o -$(MODULE)-objs += spa_misc.o -$(MODULE)-objs += spa_stats.o -$(MODULE)-objs += space_map.o -$(MODULE)-objs += space_reftree.o -$(MODULE)-objs += txg.o -$(MODULE)-objs += uberblock.o -$(MODULE)-objs += unique.o -$(MODULE)-objs += vdev.o -$(MODULE)-objs += vdev_cache.o -$(MODULE)-objs += vdev_draid.o -$(MODULE)-objs += vdev_draid_rand.o -$(MODULE)-objs += vdev_indirect.o -$(MODULE)-objs += vdev_indirect_births.o -$(MODULE)-objs += vdev_indirect_mapping.o -$(MODULE)-objs += vdev_initialize.o -$(MODULE)-objs += vdev_label.o -$(MODULE)-objs += vdev_mirror.o -$(MODULE)-objs += vdev_missing.o -$(MODULE)-objs += vdev_queue.o -$(MODULE)-objs += vdev_raidz.o -$(MODULE)-objs += vdev_raidz_math.o -$(MODULE)-objs += vdev_raidz_math_scalar.o -$(MODULE)-objs += vdev_rebuild.o -$(MODULE)-objs += vdev_removal.o -$(MODULE)-objs += vdev_root.o -$(MODULE)-objs += vdev_trim.o -$(MODULE)-objs += zap.o -$(MODULE)-objs += zap_leaf.o -$(MODULE)-objs += zap_micro.o -$(MODULE)-objs += zcp.o -$(MODULE)-objs += zcp_get.o -$(MODULE)-objs += zcp_global.o -$(MODULE)-objs += zcp_iter.o -$(MODULE)-objs += zcp_set.o -$(MODULE)-objs += zcp_synctask.o 
-$(MODULE)-objs += zfeature.o -$(MODULE)-objs += zfs_byteswap.o -$(MODULE)-objs += zfs_fm.o -$(MODULE)-objs += zfs_fuid.o -$(MODULE)-objs += zfs_ioctl.o -$(MODULE)-objs += zfs_log.o -$(MODULE)-objs += zfs_onexit.o -$(MODULE)-objs += zfs_quota.o -$(MODULE)-objs += zfs_ratelimit.o -$(MODULE)-objs += zfs_replay.o -$(MODULE)-objs += zfs_rlock.o -$(MODULE)-objs += zfs_sa.o -$(MODULE)-objs += zfs_vnops.o -$(MODULE)-objs += zil.o -$(MODULE)-objs += zio.o -$(MODULE)-objs += zio_checksum.o -$(MODULE)-objs += zio_compress.o -$(MODULE)-objs += zio_inject.o -$(MODULE)-objs += zle.o -$(MODULE)-objs += zrlock.o -$(MODULE)-objs += zthr.o -$(MODULE)-objs += zvol.o - -# Suppress incorrect warnings from versions of objtool which are not -# aware of x86 EVEX prefix instructions used for AVX512. -OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y -OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y - -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o - -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o - -$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o -$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o - -ifeq ($(CONFIG_ALTIVEC),y) -$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec -endif - -include $(mfdir)/../os/linux/zfs/Makefile diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 79e754c4abcb..af42670cc2c9 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -9337,26 +9337,37 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - cabd = abd_alloc_for_io(asize, ismd); - tmp = abd_borrow_buf(cabd, asize); + /* + * In some cases, we can wind up with size > asize, so + * we need to opt for the larger allocation option here. + * + * (We also need abd_return_buf_copy in all cases because + * it's an ASSERT() to modify the buffer before returning it + * with arc_return_buf(), and all the compressors + * write things before deciding to fail compression in nearly + * every case.) 
+ */ + cabd = abd_alloc_for_io(size, ismd); + tmp = abd_borrow_buf(cabd, size); psize = zio_compress_data(compress, to_write, tmp, size, hdr->b_complevel); - if (psize >= size) { - abd_return_buf(cabd, tmp, asize); + if (psize >= asize) { + psize = HDR_GET_PSIZE(hdr); + abd_return_buf_copy(cabd, tmp, size); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; - abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) - abd_zero_off(to_write, size, asize - size); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); + if (psize != asize) + abd_zero_off(to_write, psize, asize - psize); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) memset((char *)tmp + psize, 0, asize - psize); psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, asize); + abd_return_buf_copy(cabd, tmp, size); to_write = cabd; } @@ -11045,20 +11056,20 @@ EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, - param_get_long, ZMOD_RW, "Min arc size"); + param_get_long, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, - param_get_long, ZMOD_RW, "Max arc size"); + param_get_long, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Metadata limit for arc size"); + param_get_long, ZMOD_RW, "Metadata limit for ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, - "Percent of arc size for arc meta limit"); + "Percent of ARC size for ARC meta limit"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, - param_get_long, ZMOD_RW, "Min arc metadata"); + param_get_long, ZMOD_RW, "Minimum ARC metadata size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, "Meta objects to scan for prune"); @@ -11070,16 +11081,16 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, "Meta reclaim strategy"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, - param_get_int, ZMOD_RW, "Seconds before growing arc size"); + param_get_int, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, "Disable arc_p adapt dampener"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, - param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); + param_get_int, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, - "Percent of pagecache to reclaim arc to"); + "Percent of pagecache to reclaim ARC to"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); @@ -11088,7 +11099,7 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, - "Disable compressed arc buffers"); + "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); @@ -11149,7 +11160,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, param_get_long, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Minimum 
bytes of dnodes in arc"); + param_get_long, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index ee2470b38606..9a273b010fb1 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -339,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - mutex_enter(DBUF_HASH_MUTEX(h, idx)); + rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); return (NULL); } @@ -393,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - mutex_enter(DBUF_HASH_MUTEX(h, idx)); + rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); @@ -417,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); @@ -474,13 +474,13 @@ dbuf_hash_remove(dmu_buf_impl_t *db) /* * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_MUTEX > db_mtx. + * DBUF_HASH_RWLOCK > db_mtx. 
*/ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); - mutex_enter(DBUF_HASH_MUTEX(h, idx)); + rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; @@ -491,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } @@ -914,8 +914,8 @@ retry: sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + for (i = 0; i < DBUF_RWLOCKS; i++) + rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL); dbuf_stats_init(h); @@ -981,8 +981,8 @@ dbuf_fini(void) dbuf_stats_destroy(); - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_destroy(&h->hash_mutexes[i]); + for (i = 0; i < DBUF_RWLOCKS; i++) + rw_destroy(&h->hash_rwlocks[i]); #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages @@ -3947,7 +3947,7 @@ dmu_buf_get_user(dmu_buf_t *db_fake) } void -dmu_buf_user_evict_wait() +dmu_buf_user_evict_wait(void) { taskq_wait(dbu_evict_taskq); } diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c index fa9a5f08060a..a42750ac8e90 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf_stats.c +++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c @@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) if (size) buf[0] = 0; - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER); for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { /* * Returning ENOMEM will cause the data and header functions @@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) mutex_exit(&db->db_mtx); } - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx)); return (error); } diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 461feeffb6a3..7d8b2c96bd74 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -86,7 +86,7 @@ static int zfs_dmu_offset_next_sync = 1; * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ -static int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index e836d681e920..ca894c35253c 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -73,12 +73,19 @@ * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. 
+ * we did not allow the recordsize to be set larger than zfs_max_recordsize + * (former default: 1MB). Larger blocks could be created by changing this + * tunable, and pools with larger blocks could always be imported and used, + * regardless of this setting. + * + * We do, however, still limit it by default to 1M on x86_32, because Linux's + * 3/1 memory split doesn't leave much room for 16M chunks. */ -int zfs_max_recordsize = 1 * 1024 * 1024; +#ifdef _ILP32 +int zfs_max_recordsize = 1 * 1024 * 1024; +#else +int zfs_max_recordsize = 16 * 1024 * 1024; +#endif static int zfs_allow_redacted_dataset_mount = 0; #define SWITCH64(x, y) \ @@ -3708,6 +3715,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); + + /* + * Transfer common error blocks from old head to new head. + */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t old_head = origin_head->ds_object; + uint64_t new_head = hds->ds_object; + spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); + } } /* @@ -4924,13 +4940,38 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; } -#if defined(_LP64) -#define RECORDSIZE_PERM ZMOD_RW -#else -/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ -#define RECORDSIZE_PERM ZMOD_RD -#endif -ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, +/* + * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj + * dataset whose birth time is >= min_txg. + */ +int +dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg, + uint64_t *oldest_dsobj) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0 && min_txg < prev_obj_txg) { + dsl_dataset_rele(ds, FTAG); + if ((error = dsl_dataset_hold_obj(dp, prev_obj, + FTAG, &ds)) != 0) + return (error); + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + *oldest_dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + return (0); +} + +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, ZMOD_RW, "Max allowed record size"); ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c index b32929b3320c..7dddd8eed5e9 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c +++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c @@ -1153,6 +1153,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); dsl_dataset_rele(prev, FTAG); } + /* Delete errlog. */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) + spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx); } void diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 7ed83b305db7..ab32bfec1310 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -48,10 +48,10 @@ /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. 
In normal - * operation, we will try to write this amount of data to a top-level vdev - * before moving on to the next one. + * operation, we will try to write this amount of data to each disk before + * moving on to the next top-level vdev. */ -static unsigned long metaslab_aliquot = 512 << 10; +static unsigned long metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. @@ -899,7 +899,8 @@ metaslab_group_activate(metaslab_group_t *mg) if (++mg->mg_activation_count <= 0) return; - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + mg->mg_aliquot = metaslab_aliquot * MAX(1, + vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { @@ -2750,7 +2751,8 @@ metaslab_fini_flush_data(metaslab_t *msp) mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); - spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), + metaslab_unflushed_dirty(msp)); } uint64_t @@ -3728,50 +3730,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) metaslab_flush_update(msp, tx); } -/* - * Called when the metaslab has been flushed (its own spacemap now reflects - * all the contents of the pool-wide spacemap log). Updates the metaslab's - * metadata and any pool-wide related log space map data (e.g. summary, - * obsolete logs, etc..) to reflect that. - */ static void -metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { - metaslab_group_t *mg = msp->ms_group; - spa_t *spa = mg->mg_vd->vdev_spa; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - ASSERT3U(spa_sync_pass(spa), ==, 1); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - /* - * Just because a metaslab got flushed, that doesn't mean that - * it will pass through metaslab_sync_done(). Thus, make sure to - * update ms_synced_length here in case it doesn't. - */ - msp->ms_synced_length = space_map_length(msp->ms_sm); + mutex_enter(&spa->spa_flushed_ms_lock); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, B_TRUE); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); - /* - * We may end up here from metaslab_condense() without the - * feature being active. In that case this is a no-op. 
- */ - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa, B_TRUE); +} +void +metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); @@ -3779,17 +3776,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); + /* update log space map summary */ + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, + ms_prev_flushed_dirty); + spa_log_summary_add_flushed_metaslab(spa, dirty); + /* cleanup obsolete logs if any */ - uint64_t log_blocks_before = spa_log_sm_nblocks(spa); spa_cleanup_old_sm_logs(spa, tx); - uint64_t log_blocks_after = spa_log_sm_nblocks(spa); - VERIFY3U(log_blocks_after, <=, log_blocks_before); +} - /* update log space map summary */ - uint64_t blocks_gone = log_blocks_before - log_blocks_after; - spa_log_summary_add_flushed_metaslab(spa); - spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); - spa_log_summary_decrement_blkcount(spa, blocks_gone); +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc..) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. 
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || + metaslab_unflushed_txg(msp) == 0) + return; + + metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t @@ -4005,23 +4032,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT0(metaslab_allocated_space(msp)); } - if (metaslab_unflushed_txg(msp) == 0 && - spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { - ASSERT(spa_syncing_log_sm(spa) != NULL); - - metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); - spa_log_sm_increment_current_mscount(spa); - spa_log_summary_add_flushed_metaslab(spa); - - ASSERT(msp->ms_sm != NULL); - mutex_enter(&spa->spa_flushed_ms_lock); - avl_add(&spa->spa_metaslabs_by_flushed, msp); - mutex_exit(&spa->spa_flushed_ms_lock); - - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - } - if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); @@ -4069,6 +4079,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + if (metaslab_unflushed_txg(msp) == 0) + metaslab_unflushed_add(msp, tx); + else if (!metaslab_unflushed_dirty(msp)) + metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); @@ -6131,6 +6145,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) mutex_exit(&mg->mg_ms_disabled_lock); } +void +metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) +{ + ms->ms_unflushed_dirty = dirty; +} + static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { @@ -6167,15 +6187,16 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { - spa_t *spa = ms->ms_group->mg_vd->vdev_spa; - - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; - ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } +boolean_t +metaslab_unflushed_dirty(metaslab_t *ms) +{ + return (ms->ms_unflushed_dirty); +} + uint64_t metaslab_unflushed_txg(metaslab_t *ms) { diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index 2b6776581a47..db8c2b831f1d 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -1068,8 +1068,8 @@ sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs, za.za_num_integers); break; } - VERIFY(ddi_strtoull(za.za_name, NULL, 10, - (unsigned long long *)&lot_num) == 0); + VERIFY0(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num)); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index e69cb5527be8..01114dedef48 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -4355,7 +4355,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) error = spa_ld_log_spacemaps(spa); if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index c6b28ea7d1b8..9e5d1de63c0b 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ 
b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -20,7 +20,8 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014, Delphix. All rights reserved. + * Copyright (c) 2021, George Amanakis. All rights reserved. */ /* @@ -43,6 +44,16 @@ * calculation when the data is requested, storing the result so future queries * will be faster. * + * If the head_errlog feature is enabled, a different on-disk format is used. + * The error log of each head dataset is stored separately in the zap object + * and keyed by the head id. This enables listing every dataset affected in + * userland. In order to be able to track whether an error block has been + * modified or added to snapshots since it was marked as an error, a new tuple + * is introduced: zbookmark_err_phys_t. It allows the storage of the birth + * transaction group of an error block on-disk. The birth transaction group is + * used by check_filesystem() to assess whether this block was freed, + * re-written or added to a snapshot since its marking as an error. + * * This log is then shipped into an nvlist where the key is the dataset name and * the value is the object name. Userland is then responsible for uniquifying * this list and displaying it to the user. @@ -53,7 +64,17 @@ #include <sys/spa_impl.h> #include <sys/zap.h> #include <sys/zio.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_objset.h> +#include <sys/dbuf.h> +/* + * spa_upgrade_errlog_limit : A zfs module parameter that controls the number + * of on-disk error log entries that will be converted to the new + * format when enabling head_errlog. Defaults to 0 which converts + * all log entries. + */ +static uint32_t spa_upgrade_errlog_limit = 0; /* * Convert a bookmark to a string. @@ -67,9 +88,35 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) } /* - * Convert a string to a bookmark + * Convert an err_phys to a string. + */ +static void +errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, + (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); +} + +/* + * Convert a string to a err_phys. + */ +static void +name_to_errphys(char *buf, zbookmark_err_phys_t *zep) +{ + zep->zb_object = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_birth = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +/* + * Convert a string to a bookmark. 
*/ -#ifdef _KERNEL static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { @@ -82,8 +129,74 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } + +#ifdef _KERNEL +static void +zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) +{ + zb->zb_objset = dataset; + zb->zb_object = zep->zb_object; + zb->zb_level = zep->zb_level; + zb->zb_blkid = zep->zb_blkid; +} #endif +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static int +get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, + uint64_t *head_dataset_id) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + dsl_pool_config_enter(dp, FTAG); + int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); + if (error != 0) { + dsl_pool_config_exit(dp, FTAG); + return (error); + } + ASSERT(head_dataset_id); + *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (error); + } + + dnode_t *dn; + blkptr_t bp; + + error = dnode_hold(os, zep->zb_object, FTAG, &dn); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (error); + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (error == 0 && BP_IS_HOLE(&bp)) + error = SET_ERROR(ENOENT); + + zep->zb_birth = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (error); +} + /* * Log an uncorrectable error to the persistent error log. We add it to the * spa's list of pending errors. The changes are actually synced out to disk @@ -128,6 +241,276 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) mutex_exit(&spa->spa_errlist_lock); } +#ifdef _KERNEL +static int +find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg) +{ + objset_t *os; + int error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + dnode_t *dn; + blkptr_t bp; + + error = dnode_hold(os, zep->zb_object, FTAG, &dn); + if (error != 0) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (error == 0 && BP_IS_HOLE(&bp)) + error = SET_ERROR(ENOENT); + + *birth_txg = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); +} + +/* + * This function serves a double role. If only_count is true, it returns + * (in *count) how many times an error block belonging to this filesystem is + * referenced by snapshots or clones. If only_count is false, each time the + * error block is referenced by a snapshot or clone, it fills the userspace + * array at uaddr with the bookmarks of the error blocks. The array is filled + * from the back and *count is modified to be the number of unused entries at + * the beginning of the array. 
+ */ +static int +check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *count, void *uaddr, boolean_t only_count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + error = find_birth_txg(ds, zep, &latest_txg); + if (error == 0) { + if (zep->zb_birth == latest_txg) { + /* Block neither free nor rewritten. */ + if (!only_count) { + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + if (copyout(&zb, (char *)uaddr + (*count - 1) + * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + (*count)--; + } else { + (*count)++; + } + check_snapshot = B_FALSE; + } else { + ASSERT3U(zep->zb_birth, <, latest_txg); + txg_to_consider = latest_txg; + } + } + + /* How many snapshots reference this block. */ + uint64_t snap_count; + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (snap_count == 0) { + /* File system has no snapshot. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t), + KM_SLEEP); + + int aff_snap_count = 0; + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + dsl_dataset_rele(ds, FTAG); + error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + if (error != 0) + goto out; + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) + break; + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + affected = (error == 0 && zep->zb_birth == blk_txg); + } + + if (affected) { + snap_obj_array[aff_snap_count] = snap_obj; + aff_snap_count++; + + if (!only_count) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + if (copyout(&zb, (char *)uaddr + (*count - 1) * + sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + dsl_dataset_rele(ds, FTAG); + error = SET_ERROR(EFAULT); + goto out; + } + (*count)--; + } else { + (*count)++; + } + + /* + * Only clones whose origins were affected could also + * have affected snapshots. 
+ */ + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + error = check_filesystem(spa, + za.za_first_integer, zep, + count, uaddr, only_count); + + if (error != 0) { + zap_cursor_fini(&zc); + goto out; + } + } + zap_cursor_fini(&zc); + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + +out: + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (error); +} + +static int +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *top_affected_fs) +{ + uint64_t oldest_dsobj; + int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, + &oldest_dsobj); + if (error != 0) + return (error); + + dsl_dataset_t *ds; + error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, + FTAG, &ds); + if (error != 0) + return (error); + + *top_affected_fs = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static int +process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *count, void *uaddr, boolean_t only_count) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_pool_config_enter(dp, FTAG); + uint64_t top_affected_fs; + + int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); + if (error == 0) + error = check_filesystem(spa, top_affected_fs, zep, count, + uaddr, only_count); + + dsl_pool_config_exit(dp, FTAG); + return (error); +} + +static uint64_t +get_errlog_size(spa_t *spa, uint64_t spa_err_obj) +{ + if (spa_err_obj == 0) + return (0); + uint64_t total = 0; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + (void) process_error_block(spa, head_ds, &head_ds_block, + &total, NULL, B_TRUE); + } + zap_cursor_fini(&head_ds_cursor); + } + zap_cursor_fini(&zc); + return (total); +} + +static uint64_t +get_errlist_size(spa_t *spa, avl_tree_t *tree) +{ + if (avl_numnodes(tree) == 0) + return (0); + uint64_t total = 0; + + spa_error_entry_t *se; + for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + /* + * If we cannot find out the head dataset and birth txg of + * the present error block, we opt not to error out. In the + * next pool sync this information will be retrieved by + * sync_error_list() and written to the on-disk error log. + */ + uint64_t head_ds_obj; + if (get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_ds_obj) == 0) + (void) process_error_block(spa, head_ds_obj, &zep, + &total, NULL, B_TRUE); + } + return (total); +} +#endif + /* * Return the number of errors currently in the error log. 
This is actually the * sum of both the last log and the current log, since we don't know the union @@ -136,83 +519,284 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) uint64_t spa_get_errlog_size(spa_t *spa) { - uint64_t total = 0, count; + uint64_t total = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + mutex_enter(&spa->spa_errlog_lock); + uint64_t count; + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); + } else { +#ifdef _KERNEL + mutex_enter(&spa->spa_errlog_lock); + total += get_errlog_size(spa, spa->spa_errlog_last); + total += get_errlog_size(spa, spa->spa_errlog_scrub); + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += get_errlist_size(spa, &spa->spa_errlist_last); + total += get_errlist_size(spa, &spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); +#endif + } + return (total); +} - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && - zap_count(spa->spa_meta_objset, spa->spa_errlog_last, - &count) == 0) - total += count; - mutex_exit(&spa->spa_errlog_lock); +/* + * This function sweeps through an on-disk error log and stores all bookmarks + * as error bookmarks in a new ZAP object. At the end we discard the old one, + * and spa_update_errlog() will set the spa's on-disk error log to new ZAP + * object. + */ +static void +sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_phys_t zb; + uint64_t count; - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); + *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - return (total); + /* + * If we cannnot perform the upgrade we should clear the old on-disk + * error logs. + */ + if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); + return; + } + + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (spa_upgrade_errlog_limit != 0 && + zc.zc_cd == spa_upgrade_errlog_limit) + break; + + name_to_bookmark(za.za_name, &zb); + + zbookmark_err_phys_t zep; + zep.zb_object = zb.zb_object; + zep.zb_level = zb.zb_level; + zep.zb_blkid = zb.zb_blkid; + + /* + * We cannot use get_head_and_birth_txg() because it will + * acquire the pool config lock, which we already have. In case + * of an error we simply continue. 
+ */ + uint64_t head_dataset_obj; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds); + if (error != 0) + continue; + + head_dataset_obj = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + /* + * The objset and the dnode are required for getting the block + * pointer, which is used to determine if BP_IS_HOLE(). If + * getting the objset or the dnode fails, do not create a + * zap entry (presuming we know the dataset) as this may create + * spurious errors that we cannot ever resolve. If an error is + * truly persistent, it should re-appear after a scan. + */ + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + continue; + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + continue; + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp, + NULL, NULL); + + zep.zb_birth = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + + if (error != 0 || BP_IS_HOLE(&bp)) + continue; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, + head_dataset_obj, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *newobj, head_dataset_obj, err_obj, tx); + } + + char buf[64]; + char *name = ""; + errphys_to_name(&zep, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, err_obj, + buf, 1, strlen(name) + 1, name, tx); + } + zap_cursor_fini(&zc); + + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); +} + +void +spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t newobj = 0; + + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_last != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); + spa->spa_errlog_last = newobj; + } + + if (spa->spa_errlog_scrub != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); + spa->spa_errlog_scrub = newobj; + } + mutex_exit(&spa->spa_errlog_lock); } #ifdef _KERNEL +/* + * If an error block is shared by two datasets it will be counted twice. For + * detailed message see spa_get_errlog_size() above. 
+ */ static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { zap_cursor_t zc; zap_attribute_t za; - zbookmark_phys_t zb; if (obj == 0) return (0); - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (*count == 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(ENOMEM)); + } + + zbookmark_phys_t zb; + name_to_bookmark(za.za_name, &zb); + + if (copyout(&zb, (char *)uaddr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } + *count -= 1; - if (*count == 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); } + zap_cursor_fini(&zc); + return (0); + } - name_to_bookmark(za.za_name, &zb); + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + + uint64_t head_ds_err_obj = za.za_first_integer; + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + zbookmark_err_phys_t head_ds_block; + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + int error = process_error_block(spa, head_ds, + &head_ds_block, count, uaddr, B_FALSE); + + if (error != 0) { + zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(&zc); + return (error); + } } - - *count -= 1; + zap_cursor_fini(&head_ds_cursor); } - zap_cursor_fini(&zc); - return (0); } static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) +process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) { spa_error_entry_t *se; - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (se = avl_first(list); se != NULL; + se = AVL_NEXT(list, se)) { - if (*count == 0) - return (SET_ERROR(ENOMEM)); + if (*count == 0) + return (SET_ERROR(ENOMEM)); - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); + if (copyout(&se->se_bookmark, (char *)uaddr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); - *count -= 1; + *count -= 1; + } + return (0); } + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + uint64_t head_ds_obj; + int error = get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_ds_obj); + if (error != 0) + return (error); + + error = process_error_block(spa, head_ds_obj, &zep, count, + uaddr, B_FALSE); + if (error != 0) + return (error); + } return (0); } #endif @@ -229,7 +813,7 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) * the error list 
lock when we are finished. */ int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; @@ -244,10 +828,10 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) mutex_enter(&spa->spa_errlist_lock); if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); @@ -299,35 +883,91 @@ spa_errlog_drain(spa_t *spa) /* * Process a list of errors into the current on-disk log. */ -static void +void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) { spa_error_entry_t *se; char buf[64]; void *cookie; - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); + if (avl_numnodes(t) == 0) + return; + + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - /* add errors to the current log */ + /* add errors to the current log */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { char *name = se->se_name ? se->se_name : ""; bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, + strlen(name) + 1, name, tx); + } + } else { + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + /* + * If we cannot find out the head dataset and birth txg + * of the present error block, we simply continue. + * Reinserting that error block to the error lists, + * even if we are not syncing the final txg, results + * in duplicate posting of errors. 
+ */ + uint64_t head_dataset_obj; + int error = get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_dataset_obj); + if (error != 0) + continue; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, err_obj, tx); + } + errphys_to_name(&zep, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); + err_obj, buf, 1, strlen(name) + 1, name, tx); } + } + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); +} - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); +static void +delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); } + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } /* @@ -378,8 +1018,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (scrub_finished) { if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); + delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; @@ -406,6 +1045,137 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_errlog_lock); } +static void +delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, + dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + if (head_ds == ds) { + (void) zap_remove(spa->spa_meta_objset, spa_err_obj, + za.za_name, tx); + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + break; + } + } + zap_cursor_fini(&zc); +} + +void +spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); + delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + +static int +find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, + uint64_t *txg) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0) { + dsl_dataset_rele(ds, FTAG); + if ((error = dsl_dataset_hold_obj(dp, prev_obj, + FTAG, &ds)) == 0 && + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) + break; + + if (error != 0) + return (error); + + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + ASSERT(prev_obj != 0); + *txg 
= prev_obj_txg; + return (0); +} + +static void +swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t + old_head, dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + uint64_t old_head_errlog; + int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, + old_head, &old_head_errlog); + + /* If no error log, then there is nothing to do. */ + if (error != 0) + return; + + uint64_t txg; + error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); + if (error != 0) + return; + + /* + * Create an error log if the file system being promoted does not + * already have one. + */ + uint64_t new_head_errlog; + error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, + &new_head_errlog); + + if (error != 0) { + new_head_errlog = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, + new_head, new_head_errlog, tx); + } + + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_err_phys_t err_block; + for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + char *name = ""; + name_to_errphys(za.za_name, &err_block); + if (err_block.zb_birth < txg) { + (void) zap_update(spa->spa_meta_objset, new_head_errlog, + za.za_name, 1, strlen(name) + 1, name, tx); + + (void) zap_remove(spa->spa_meta_objset, old_head_errlog, + za.za_name, tx); + } + } + zap_cursor_fini(&zc); +} + +void +spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, + dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); + swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); @@ -415,4 +1185,14 @@ EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); +EXPORT_SYMBOL(spa_delete_dataset_errlog); +EXPORT_SYMBOL(spa_swap_errlog); +EXPORT_SYMBOL(sync_error_list); +EXPORT_SYMBOL(spa_upgrade_errlog); #endif + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, INT, ZMOD_RW, + "Limit the number of errors which will be upgraded to the new " + "on-disk error log when enabling head_errlog"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c index 110a4eab99f9..f831509a4247 100644 --- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c +++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c @@ -257,7 +257,12 @@ static unsigned long zfs_unflushed_log_block_min = 1000; * terms of performance. Thus we have a hard limit in the size of the log in * terms of blocks. */ -static unsigned long zfs_unflushed_log_block_max = (1ULL << 18); +static unsigned long zfs_unflushed_log_block_max = (1ULL << 17); + +/* + * Also we have a hard limit in the size of the log in terms of dirty TXGs. + */ +static unsigned long zfs_unflushed_log_txg_max = 1000; /* * Max # of rows allowed for the log_summary. 
diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
index 110a4eab99f9..f831509a4247 100644
--- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
+++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
@@ -257,7 +257,12 @@ static unsigned long zfs_unflushed_log_block_min = 1000;
  * terms of performance. Thus we have a hard limit in the size of the log in
  * terms of blocks.
  */
-static unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+static unsigned long zfs_unflushed_log_block_max = (1ULL << 17);
+
+/*
+ * Also we have a hard limit in the size of the log in terms of dirty TXGs.
+ */
+static unsigned long zfs_unflushed_log_txg_max = 1000;
 
 /*
  * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
@@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa)
 		return;
 	}
 
-	uint64_t calculated_limit =
-	    (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
-	spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+	uint64_t msdcount = 0;
+	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+	    e; e = list_next(&spa->spa_log_summary, e))
+		msdcount += e->lse_msdcount;
+
+	uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100;
+	spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit,
 	    zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
 }
 
@@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa)
 }
 
 static boolean_t
-summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
 {
+	if (e->lse_end == txg)
+		return (0);
+	if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max,
+	    zfs_max_logsm_summary_length))
+		return (1);
 	uint64_t blocks_per_row = MAX(1,
 	    DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
 	    zfs_max_logsm_summary_length));
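Note: summary_entry_is_full() now also closes a summary row once it spans its share of the TXG budget. With zfs_unflushed_log_txg_max = 1000 as set above, and assuming the in-tree default summary length of 10 rows, each row covers at most 100 TXGs — a sketch of just that arithmetic:

#include <stdio.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int
main(void)
{
	unsigned long txg_max = 1000;	/* zfs_unflushed_log_txg_max above */
	unsigned long rows = 10;	/* assumed zfs_max_logsm_summary_length */

	/* a summary row is "full" once its txgcount reaches this value */
	printf("txgs per row: %lu\n", DIV_ROUND_UP(txg_max, rows)); /* 100 */
	return (0);
}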
@@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
  * the metaslab.
  */
 void
-spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
 {
 	/*
 	 * We don't track summary data for read-only pools and this function
@@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
 	}
 
 	target->lse_mscount--;
+	if (dirty)
+		target->lse_msdcount--;
 }
 
 /*
@@ -490,8 +506,10 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
 void
 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
 {
-	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
-	    e != NULL; e = list_head(&spa->spa_log_summary)) {
+	log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+	if (e->lse_txgcount > 0)
+		e->lse_txgcount--;
+	for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
 		if (e->lse_blkcount > blocks_gone) {
 			/*
 			 * Assert that we stopped at an entry that is not
@@ -560,31 +578,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa)
 
 static void
 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
-    uint64_t nblocks)
+    uint64_t metaslabs_dirty, uint64_t nblocks)
 {
 	log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
 
-	if (e == NULL || summary_entry_is_full(spa, e)) {
+	if (e == NULL || summary_entry_is_full(spa, e, txg)) {
 		e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
-		e->lse_start = txg;
+		e->lse_start = e->lse_end = txg;
+		e->lse_txgcount = 1;
 		list_insert_tail(&spa->spa_log_summary, e);
 	}
 
 	ASSERT3U(e->lse_start, <=, txg);
+	if (e->lse_end < txg) {
+		e->lse_end = txg;
+		e->lse_txgcount++;
+	}
 	e->lse_mscount += metaslabs_flushed;
+	e->lse_msdcount += metaslabs_dirty;
 	e->lse_blkcount += nblocks;
 }
 
 static void
 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
 {
-	summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+	summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks);
 }
 
 void
-spa_log_summary_add_flushed_metaslab(spa_t *spa)
+spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
 {
-	summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+	summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0);
+}
+
+void
+spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
+{
+	log_summary_entry_t *target = NULL;
+	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+	    e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+		if (e->lse_start > txg)
+			break;
+		target = e;
+	}
+	ASSERT3P(target, !=, NULL);
+	ASSERT3U(target->lse_mscount, !=, 0);
+	target->lse_msdcount++;
+}
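Note: summary_add_data() now tracks the distinct-TXG span of each row through lse_start/lse_end/lse_txgcount: re-adding data for the txg already at the tail leaves the count alone, while a newer txg bumps both the end and the count. A toy model of just that bookkeeping:

#include <stdint.h>
#include <stdio.h>

struct row { uint64_t start, end, txgcount; };

static void
row_add_txg(struct row *r, uint64_t txg)
{
	if (r->txgcount == 0) {		/* fresh row */
		r->start = r->end = txg;
		r->txgcount = 1;
	} else if (r->end < txg) {	/* first data for a newer txg */
		r->end = txg;
		r->txgcount++;
	}				/* same txg: count unchanged */
}

int
main(void)
{
	struct row r = { 0 };
	uint64_t txgs[] = { 50, 50, 51, 53, 53 };

	for (int i = 0; i < 5; i++)
		row_add_txg(&r, txgs[i]);
	/* three distinct txgs were recorded: 50, 51, 53 */
	printf("start=%llu end=%llu txgcount=%llu\n",
	    (unsigned long long)r.start, (unsigned long long)r.end,
	    (unsigned long long)r.txgcount);
	return (0);
}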
 
 /*
@@ -630,6 +669,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
 	int64_t available_blocks =
 	    spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
 
+	int64_t available_txgs = zfs_unflushed_log_txg_max;
+	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+	    e; e = list_next(&spa->spa_log_summary, e))
+		available_txgs -= e->lse_txgcount;
+
 	/*
 	 * This variable tells us the total number of flushes needed to
 	 * keep the log size within the limit when we reach txgs_in_future.
@@ -637,9 +681,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
 	uint64_t total_flushes = 0;
 
 	/* Holds the current maximum of our estimates so far. */
-	uint64_t max_flushes_pertxg =
-	    MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
-	    zfs_min_metaslabs_to_flush);
+	uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
 
 	/*
 	 * For our estimations we only look as far in the future
@@ -653,11 +695,14 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
 		 * then keep skipping TXGs accumulating more blocks
 		 * based on the incoming rate until we exceed it.
 		 */
-		if (available_blocks >= 0) {
-			uint64_t skip_txgs = (available_blocks / incoming) + 1;
+		if (available_blocks >= 0 && available_txgs >= 0) {
+			uint64_t skip_txgs = MIN(available_txgs + 1,
+			    (available_blocks / incoming) + 1);
 			available_blocks -= (skip_txgs * incoming);
+			available_txgs -= skip_txgs;
 			txgs_in_future += skip_txgs;
 			ASSERT3S(available_blocks, >=, -incoming);
+			ASSERT3S(available_txgs, >=, -1);
 		}
 
 		/*
@@ -666,9 +711,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
 		 * based on the current entry in the summary, updating
 		 * our available_blocks.
 		 */
-		ASSERT3S(available_blocks, <, 0);
+		ASSERT(available_blocks < 0 || available_txgs < 0);
 		available_blocks += e->lse_blkcount;
-		total_flushes += e->lse_mscount;
+		available_txgs += e->lse_txgcount;
+		total_flushes += e->lse_msdcount;
 
 		/*
 		 * Keep the running maximum of the total_flushes that
@@ -680,8 +726,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa)
 		 */
 		max_flushes_pertxg = MAX(max_flushes_pertxg,
 		    DIV_ROUND_UP(total_flushes, txgs_in_future));
-		ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
-		    max_flushes_pertxg);
 	}
 	return (max_flushes_pertxg);
 }
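Note: the estimator above now budgets both blocks and TXGs, skipping ahead only while both budgets last and taking the running maximum of ceil(total_flushes / txgs_in_future). A self-contained toy run of the same accounting (all inputs are made up; only the control flow mirrors the hunk):

#include <stdint.h>
#include <stdio.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))
#define	MAX(a, b)		((a) > (b) ? (a) : (b))

struct row { int64_t blkcount, txgcount, msdcount; };

int
main(void)
{
	/* assumed inputs: block budget, txg budget, incoming blocks/txg */
	int64_t available_blocks = 500, available_txgs = 30, incoming = 100;
	uint64_t txgs_in_future = 1, total_flushes = 0, max_pertxg = 1;

	struct row summary[] = {
		{ 400, 10, 8 }, { 300, 10, 6 }, { 200, 10, 4 },
	};

	for (int i = 0; i < 3; i++) {
		if (available_blocks >= 0 && available_txgs >= 0) {
			int64_t skip = MIN(available_txgs + 1,
			    (available_blocks / incoming) + 1);
			available_blocks -= skip * incoming;
			available_txgs -= skip;
			txgs_in_future += (uint64_t)skip;
		}
		available_blocks += summary[i].blkcount;
		available_txgs += summary[i].txgcount;
		total_flushes += (uint64_t)summary[i].msdcount;
		max_pertxg = MAX(max_pertxg,
		    DIV_ROUND_UP(total_flushes, txgs_in_future));
	}
	/* prints 2 for these inputs */
	printf("flush at least %llu metaslabs per txg\n",
	    (unsigned long long)max_pertxg);
	return (0);
}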
 
@@ -771,14 +815,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
 	uint64_t want_to_flush;
 	if (spa_flush_all_logs_requested(spa)) {
 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
-		want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+		want_to_flush = UINT64_MAX;
 	} else {
 		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
 	}
 
-	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
-	    want_to_flush);
-
 	/* Used purely for verification purposes */
 	uint64_t visited = 0;
 
@@ -809,31 +850,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
 		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
 			break;
 
-		mutex_enter(&curr->ms_sync_lock);
-		mutex_enter(&curr->ms_lock);
-		boolean_t flushed = metaslab_flush(curr, tx);
-		mutex_exit(&curr->ms_lock);
-		mutex_exit(&curr->ms_sync_lock);
-
-		/*
-		 * If we failed to flush a metaslab (because it was loading),
-		 * then we are done with the block heuristic as it's not
-		 * possible to destroy any log space maps once you've skipped
-		 * a metaslab. In that case we just set our counter to 0 but
-		 * we continue looping in case there is still memory pressure
-		 * due to unflushed changes. Note that, flushing a metaslab
-		 * that is not the oldest flushed in the pool, will never
-		 * destroy any log space maps [see spa_cleanup_old_sm_logs()].
-		 */
-		if (!flushed) {
-			want_to_flush = 0;
-		} else if (want_to_flush > 0) {
-			want_to_flush--;
-		}
+		if (metaslab_unflushed_dirty(curr)) {
+			mutex_enter(&curr->ms_sync_lock);
+			mutex_enter(&curr->ms_lock);
+			metaslab_flush(curr, tx);
+			mutex_exit(&curr->ms_lock);
+			mutex_exit(&curr->ms_sync_lock);
+			if (want_to_flush > 0)
+				want_to_flush--;
+		} else
+			metaslab_unflushed_bump(curr, tx, B_FALSE);
 
 		visited++;
 	}
 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+
+	spa_log_sm_set_blocklimit(spa);
 }
 
 /*
@@ -904,6 +936,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
 		avl_remove(&spa->spa_sm_logs_by_txg, sls);
 		space_map_free_obj(mos, sls->sls_sm_obj, tx);
 		VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+		spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
 		spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
 		kmem_free(sls, sizeof (spa_log_sm_t));
 	}
@@ -963,12 +996,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
 	VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
 	    0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
-	/*
-	 * If the log space map feature was just enabled, the blocklimit
-	 * has not yet been set.
-	 */
-	if (spa_log_sm_blocklimit(spa) == 0)
-		spa_log_sm_set_blocklimit(spa);
+	spa_log_sm_set_blocklimit(spa);
 }
 
 /*
@@ -1094,12 +1122,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
 		panic("invalid maptype_t");
 		break;
 	}
+	if (!metaslab_unflushed_dirty(ms)) {
+		metaslab_set_unflushed_dirty(ms, B_TRUE);
+		spa_log_summary_dirty_flushed_metaslab(spa,
+		    metaslab_unflushed_txg(ms));
+	}
 	return (0);
 }
 
 static int
 spa_ld_log_sm_data(spa_t *spa)
 {
+	spa_log_sm_t *sls, *psls;
 	int error = 0;
 
 	/*
@@ -1113,41 +1147,71 @@ spa_ld_log_sm_data(spa_t *spa)
 	ASSERT0(spa->spa_unflushed_stats.sus_memused);
 
 	hrtime_t read_logs_starttime = gethrtime();
-	/* this is a no-op when we don't have space map logs */
-	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
-	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
-		space_map_t *sm = NULL;
-		error = space_map_open(&sm, spa_meta_objset(spa),
-		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
-		if (error != 0) {
-			spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
-			    "space_map_open(obj=%llu) [error %d]",
-			    (u_longlong_t)sls->sls_sm_obj, error);
-			goto out;
+
+	/* Prefetch log spacemaps dnodes. */
+	for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
+	    sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+		dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
+		    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+	}
+
+	uint_t pn = 0;
+	uint64_t ps = 0;
+	psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
+	while (sls != NULL) {
+		/* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
+		if (psls != NULL && pn < 16 &&
+		    (pn < 2 || ps < 2 * dmu_prefetch_max)) {
+			error = space_map_open(&psls->sls_sm,
+			    spa_meta_objset(spa), psls->sls_sm_obj, 0,
+			    UINT64_MAX, SPA_MINBLOCKSHIFT);
+			if (error != 0) {
+				spa_load_failed(spa, "spa_ld_log_sm_data(): "
+				    "failed at space_map_open(obj=%llu) "
+				    "[error %d]",
+				    (u_longlong_t)sls->sls_sm_obj, error);
+				goto out;
+			}
+			dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
+			    0, 0, space_map_length(psls->sls_sm),
+			    ZIO_PRIORITY_ASYNC_READ);
+			pn++;
+			ps += space_map_length(psls->sls_sm);
+			psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
+			continue;
 		}
 
+		/* Load TXG log spacemap into ms_unflushed_allocs/frees. */
+		cond_resched();
+		ASSERT0(sls->sls_nblocks);
+		sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
+		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+		summary_add_data(spa, sls->sls_txg,
+		    sls->sls_mscount, 0, sls->sls_nblocks);
+
 		struct spa_ld_log_sm_arg vla = {
 			.slls_spa = spa,
 			.slls_txg = sls->sls_txg
 		};
-		error = space_map_iterate(sm, space_map_length(sm),
-		    spa_ld_log_sm_cb, &vla);
+		error = space_map_iterate(sls->sls_sm,
+		    space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
 		if (error != 0) {
-			space_map_close(sm);
 			spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
 			    "at space_map_iterate(obj=%llu) [error %d]",
 			    (u_longlong_t)sls->sls_sm_obj, error);
 			goto out;
 		}
 
-		ASSERT0(sls->sls_nblocks);
-		sls->sls_nblocks = space_map_nblocks(sm);
-		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
-		summary_add_data(spa, sls->sls_txg,
-		    sls->sls_mscount, sls->sls_nblocks);
+		pn--;
+		ps -= space_map_length(sls->sls_sm);
+		space_map_close(sls->sls_sm);
+		sls->sls_sm = NULL;
+		sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
 
-		space_map_close(sm);
+		/* Update log block limits considering just loaded. */
+		spa_log_sm_set_blocklimit(spa);
 	}
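Note: the rewritten loader overlaps iteration with read-ahead: pn counts maps opened and prefetched ahead of the one being iterated, ps counts their bytes, and the window is capped at 16 maps or roughly 2 * dmu_prefetch_max bytes. A toy model of that pn/ps window over a made-up list of map sizes (the 8 MiB dmu_prefetch_max value is assumed for the demo):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t dmu_prefetch_max = 8 << 20;	/* assumed tunable value */
	uint64_t len[] = { 1 << 20, 4 << 20, 16 << 20, 2 << 20, 1 << 20 };
	int n = 5, load = 0, pf = 0;	/* next map to load / to prefetch */
	unsigned pn = 0;		/* maps prefetched ahead */
	uint64_t ps = 0;		/* bytes prefetched ahead */

	while (load < n) {
		/* prefetch ahead while under both the count and byte caps */
		if (pf < n && pn < 16 &&
		    (pn < 2 || ps < 2 * dmu_prefetch_max)) {
			printf("prefetch map %d (%llu bytes)\n", pf,
			    (unsigned long long)len[pf]);
			ps += len[pf];
			pn++;
			pf++;
			continue;
		}
		/* otherwise consume the oldest prefetched map */
		printf("load map %d\n", load);
		ps -= len[load];
		pn--;
		load++;
	}
	return (0);
}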
+
 	hrtime_t read_logs_endtime = gethrtime();
 	spa_load_note(spa,
 	    "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
@@ -1157,6 +1221,18 @@ spa_ld_log_sm_data(spa_t *spa)
 	    (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
 
 out:
+	if (error != 0) {
+		for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+		    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+			if (sls->sls_sm) {
+				space_map_close(sls->sls_sm);
+				sls->sls_sm = NULL;
+			}
+		}
+	} else {
+		ASSERT0(pn);
+		ASSERT0(ps);
+	}
 	/*
 	 * Now that the metaslabs contain their unflushed changes:
 	 * [1] recalculate their actual allocated space
@@ -1237,6 +1313,9 @@ spa_ld_unflushed_txgs(vdev_t *vd)
 	}
 
 	ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+	ms->ms_unflushed_dirty = B_FALSE;
+	ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
+	ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
 	if (ms->ms_unflushed_txg != 0) {
 		mutex_enter(&spa->spa_flushed_ms_lock);
 		avl_add(&spa->spa_metaslabs_by_flushed, ms);
@@ -1300,6 +1379,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
 	"Lower-bound limit for the maximum amount of blocks allowed in "
 	"log spacemap (see zfs_unflushed_log_block_max)");
 
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW,
+	"Hard limit (upper-bound) in the size of the space map log "
+	"in terms of dirty TXGs.");
+
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
 	"Tunable used to determine the number of blocks that can be used for "
 	"the spacemap log, expressed as a percentage of the total number of "
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index db2d2c5e44fb..ce7f020a0d86 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1523,13 +1523,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
-	/*
-	 * Regardless whether this vdev was just added or it is being
-	 * expanded, the metaslab count has changed. Recalculate the
-	 * block limit.
-	 */
-	spa_log_sm_set_blocklimit(spa);
-
 	return (0);
 }
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
index 17f9d6c90804..5508d273758d 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_removal.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -1386,7 +1386,6 @@ vdev_remove_complete(spa_t *spa)
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
-		spa_log_sm_set_blocklimit(spa);
 	}
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT0(vd->vdev_ms_count);
@@ -2131,7 +2130,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 	 * metaslab_class_histogram_verify()
 	 */
 	vdev_metaslab_fini(vd);
-	spa_log_sm_set_blocklimit(spa);
 
 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 	*txg = spa_vdev_config_enter(spa);
@@ -2251,7 +2249,6 @@ spa_vdev_remove_top_check(vdev_t *vd)
 	 * and not be raidz or draid.
 	 */
 	vdev_t *rvd = spa->spa_root_vdev;
-	int num_indirect = 0;
 	for (uint64_t id = 0; id < rvd->vdev_children; id++) {
 		vdev_t *cvd = rvd->vdev_child[id];
@@ -2267,8 +2264,6 @@ spa_vdev_remove_top_check(vdev_t *vd)
 		if (cvd->vdev_ashift != 0 &&
 		    cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
 			ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
-		if (cvd->vdev_ops == &vdev_indirect_ops)
-			num_indirect++;
 		if (!vdev_is_concrete(cvd))
 			continue;
 		if (vdev_get_nparity(cvd) != 0)
diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c
index 9d16fff81d0a..fc9167aa6611 100644
--- a/sys/contrib/openzfs/module/zfs/zfeature.c
+++ b/sys/contrib/openzfs/module/zfs/zfeature.c
@@ -389,6 +389,13 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
 	    !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) &&
 	    feature->fi_feature == SPA_FEATURE_BOOKMARK_V2)
 		spa->spa_errata = 0;
+
+	/*
+	 * Convert the old on-disk error log to the new format when activating
+	 * the head_errlog feature.
+	 */
+	if (feature->fi_feature == SPA_FEATURE_HEAD_ERRLOG)
+		spa_upgrade_errlog(spa, tx);
 }
 
 static void
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index a2824c5cc804..b3f32d64f3ef 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -5670,7 +5670,7 @@ zfs_ioc_error_log(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
-	size_t count = (size_t)zc->zc_nvlist_dst_size;
+	uint64_t count = zc->zc_nvlist_dst_size;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
index 62806e9fe8b1..a039b4da2833 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
+		atomic_inc_32(&zp->z_sync_writes_cnt);
 		zil_commit(zfsvfs->z_log, zp->z_id);
+		atomic_dec_32(&zp->z_sync_writes_cnt);
 		ZFS_EXIT(zfsvfs);
 	}
 	tsd_set(zfs_fsyncer_key, NULL);
@@ -357,11 +359,11 @@ zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
 		if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
 			vattr_t va = {0};
 
-			va.va_mask = AT_MODE;
+			va.va_mask = ATTR_MODE;
 			va.va_nodeid = zp->z_id;
 			va.va_mode = newmode;
-			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, AT_MODE,
-			    NULL);
+			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
+			    ATTR_MODE, NULL);
 			*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
 		}
 	} else {
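Note: the zfs_fsync() hunk brackets zil_commit() with an atomic in-flight counter so other code paths can cheaply tell whether a thread is inside a synchronous-write path for the znode. A standalone C11 model of that bracketing pattern (toy names; the kernel uses its own atomic_inc_32/atomic_dec_32 primitives):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint sync_writes_cnt;	/* models zp->z_sync_writes_cnt */

static void
do_fsync(void)
{
	atomic_fetch_add(&sync_writes_cnt, 1);
	/* ... the zil_commit()-like work would run here ... */
	atomic_fetch_sub(&sync_writes_cnt, 1);
}

int
main(void)
{
	do_fsync();
	/* back to 0 once the sync write has drained */
	printf("in-flight sync writes: %u\n", atomic_load(&sync_writes_cnt));
	return (0);
}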
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index f6adea572418..2a16d5cef2e2 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -166,15 +166,6 @@ zio_init(void)
 		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
 		    KMC_NODEBUG : 0;
 
-#if defined(_ILP32) && defined(_KERNEL)
-		/*
-		 * Cache size limited to 1M on 32-bit platforms until ARC
-		 * buffers no longer require virtual address space.
-		 */
-		if (size > zfs_max_recordsize)
-			break;
-#endif
-
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
 
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index eb68b05c567b..ac7c3a0c3232 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -513,6 +513,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* TX_MKDIR_ATTR */
 	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
 	zvol_replay_err,	/* TX_WRITE2 */
+	zvol_replay_err,	/* TX_SETSAXATTR */
 };
 
 /*
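Note: the zvol hunk reserves a replay slot for the new TX_SETSAXATTR record type, pointing it at the shared error handler since zvols have no xattrs to replay. A sketch of the general dispatch-table pattern behind zvol_replay_vector (toy enum and handlers, not the real zil types):

#include <stdio.h>

enum { TX_WRITE, TX_TRUNCATE, TX_SETSAXATTR, TX_MAX_TYPE };

static int
replay_write(void)	{ puts("replay write"); return (0); }
static int
replay_err(void)	{ puts("unsupported record"); return (-1); }

/* one handler per record type; unsupported types share replay_err */
static int (*const replay_vector[TX_MAX_TYPE])(void) = {
	[TX_WRITE] = replay_write,
	[TX_TRUNCATE] = replay_err,
	[TX_SETSAXATTR] = replay_err,	/* the slot added above */
};

int
main(void)
{
	(void) replay_vector[TX_WRITE]();
	(void) replay_vector[TX_SETSAXATTR]();
	return (0);
}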