diff options
Diffstat (limited to 'include/sys')
134 files changed, 2223 insertions, 779 deletions
diff --git a/include/sys/abd.h b/include/sys/abd.h index 5c6bd0c271d4..19fe96292d5f 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -86,10 +89,15 @@ extern int zfs_abd_scatter_enabled; * Allocations and deallocations */ +__attribute__((malloc)) abd_t *abd_alloc(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_linear(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_gang(void); +__attribute__((malloc)) abd_t *abd_alloc_for_io(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_sametype(abd_t *, size_t); boolean_t abd_size_alloc_linear(size_t); void abd_gang_add(abd_t *, abd_t *, boolean_t); @@ -120,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); @@ -128,11 +140,11 @@ int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void 
abd_zero_off(abd_t *, size_t, size_t); void abd_verify(abd_t *); -void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, +void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, + size_t csize, size_t dsize, const unsigned parity, void (*func_raidz_gen)(void **, const void *, size_t, size_t)); void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, + size_t tsize, const unsigned parity, void (*func_raidz_rec)(void **t, const size_t tsize, void **c, const unsigned *mul), const unsigned *mul); @@ -208,6 +220,8 @@ void abd_fini(void); /* * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index e96f1edfc8ce..f88ea25e245d 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/include/sys/arc.h b/include/sys/arc.h index 8cee8be4bc93..05307aab99e3 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -81,10 +81,10 @@ typedef struct arc_prune arc_prune_t; typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *priv); typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); -typedef void arc_prune_func_t(int64_t bytes, void *priv); +typedef void arc_prune_func_t(uint64_t bytes, void *priv); /* Shared module parameters */ -extern int zfs_arc_average_blocksize; +extern uint_t zfs_arc_average_blocksize; extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ @@ -115,7 +115,7 @@ typedef enum arc_flags ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ - ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ + ARC_FLAG_UNCACHED = 1 << 5, /* evict after use */ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* @@ -195,13 +195,11 @@ typedef enum arc_buf_flags { struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; - kmutex_t b_evict_lock; void *b_data; arc_buf_flags_t b_flags; }; typedef enum arc_buf_contents { - ARC_BUFC_INVALID, /* invalid type */ ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES @@ -229,6 +227,7 @@ typedef enum arc_state_type { ARC_STATE_MFU, ARC_STATE_MFU_GHOST, ARC_STATE_L2C_ONLY, + ARC_STATE_UNCACHED, ARC_STATE_NUMTYPES } arc_state_type_t; @@ -302,12 +301,11 @@ int arc_referenced(arc_buf_t *buf); int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *priv, zio_priority_t priority, int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, +zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, 
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, - arc_write_done_func_t *physdone, arc_write_done_func_t *done, - void *priv, zio_priority_t priority, int zio_flags, - const zbookmark_phys_t *zb); + arc_write_done_func_t *done, void *priv, zio_priority_t priority, + int zio_flags, const zbookmark_phys_t *zb); arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 755e87fe6e0e..defebe3b2fbb 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -30,6 +30,7 @@ #define _SYS_ARC_IMPL_H #include <sys/arc.h> +#include <sys/multilist.h> #include <sys/zio_crypt.h> #include <sys/zthr.h> #include <sys/aggsum.h> @@ -46,6 +47,7 @@ extern "C" { * ARC_mru_ghost - recently used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_uncached - uncacheable prefetch, to be evicted * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they * are linked onto a list in one of these arc states. These are @@ -81,14 +83,17 @@ typedef struct arc_state { */ arc_state_type_t arcs_state; /* + * total amount of data in this state. 
+ */ + zfs_refcount_t arcs_size[ARC_BUFC_NUMTYPES] ____cacheline_aligned; + /* * total amount of evictable data in this state */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned; + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + * amount of hit bytes for this state (counted only for ghost states) */ - zfs_refcount_t arcs_size; + wmsum_t arcs_hits[ARC_BUFC_NUMTYPES]; } arc_state_t; typedef struct arc_callback arc_callback_t; @@ -101,9 +106,14 @@ struct arc_callback { boolean_t acb_compressed; boolean_t acb_noauth; boolean_t acb_nobuf; + boolean_t acb_wait; + int acb_wait_error; + kmutex_t acb_wait_lock; + kcondvar_t acb_wait_cv; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; + arc_callback_t *acb_prev; arc_callback_t *acb_next; }; @@ -113,7 +123,6 @@ struct arc_write_callback { void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; @@ -150,13 +159,6 @@ struct arc_write_callback { * these two allocation states. 
*/ typedef struct l1arc_buf_hdr { - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - - /* for waiting on reads to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; @@ -167,7 +169,7 @@ typedef struct l1arc_buf_hdr { uint32_t b_mru_ghost_hits; uint32_t b_mfu_hits; uint32_t b_mfu_ghost_hits; - uint32_t b_bufcnt; + uint8_t b_byteswap; arc_buf_t *b_buf; /* self protecting */ @@ -175,6 +177,11 @@ typedef struct l1arc_buf_hdr { arc_callback_t *b_acb; abd_t *b_pabd; + +#ifdef ZFS_DEBUG + zio_cksum_t *b_freeze_cksum; + kmutex_t b_freeze_lock; +#endif } l1arc_buf_hdr_t; typedef enum l2arc_dev_hdr_flags_t { @@ -349,8 +356,9 @@ typedef struct l2arc_lb_ptr_buf { #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) -#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) -#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +/* +/- 1 here are to keep compatibility after ARC_BUFC_INVALID removal. 
*/ +#define L2BLK_GET_TYPE(field) (BF64_GET((field), 48, 8) - 1) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, (x) + 1) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) @@ -424,12 +432,12 @@ typedef struct l2arc_dev { */ typedef struct arc_buf_hdr_crypt { abd_t *b_rabd; /* raw encrypted data */ - dmu_object_type_t b_ot; /* object type */ - uint32_t b_ebufcnt; /* count of encrypted buffers */ /* dsobj for looking up encryption key for l2arc encryption */ uint64_t b_dsobj; + dmu_object_type_t b_ot; /* object type */ + /* encryption parameters */ uint8_t b_salt[ZIO_DATA_SALT_LEN]; uint8_t b_iv[ZIO_DATA_IV_LEN]; @@ -511,20 +519,33 @@ struct arc_buf_hdr { }; typedef struct arc_stats { + /* Number of requests that were satisfied without I/O. */ kstat_named_t arcstat_hits; + /* Number of requests for which I/O was already running. */ + kstat_named_t arcstat_iohits; + /* Number of requests for which I/O has to be issued. */ kstat_named_t arcstat_misses; + /* Same three, but specifically for demand data. */ kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_iohits; kstat_named_t arcstat_demand_data_misses; + /* Same three, but specifically for demand metadata. */ kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_iohits; kstat_named_t arcstat_demand_metadata_misses; + /* Same three, but specifically for prefetch data. */ kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_iohits; kstat_named_t arcstat_prefetch_data_misses; + /* Same three, but specifically for prefetch metadata. 
*/ kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_iohits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_uncached_hits; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock @@ -560,7 +581,9 @@ typedef struct arc_stats { kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; + kstat_named_t arcstat_meta; + kstat_named_t arcstat_pd; + kstat_named_t arcstat_pm; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; @@ -633,6 +656,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_anon_size; + kstat_named_t arcstat_anon_data; + kstat_named_t arcstat_anon_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -654,6 +679,8 @@ typedef struct arc_stats { * are all included in this value. */ kstat_named_t arcstat_mru_size; + kstat_named_t arcstat_mru_data; + kstat_named_t arcstat_mru_metadata; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, @@ -678,6 +705,8 @@ typedef struct arc_stats { * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; + kstat_named_t arcstat_mru_ghost_data; + kstat_named_t arcstat_mru_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -697,6 +726,8 @@ typedef struct arc_stats { * are all included in this value. 
*/ kstat_named_t arcstat_mfu_size; + kstat_named_t arcstat_mfu_data; + kstat_named_t arcstat_mfu_metadata; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu @@ -715,6 +746,8 @@ typedef struct arc_stats { * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; + kstat_named_t arcstat_mfu_ghost_data; + kstat_named_t arcstat_mfu_ghost_metadata; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type @@ -727,6 +760,23 @@ typedef struct arc_stats { * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; + /* + * Total number of bytes that are going to be evicted from ARC due to + * ARC_FLAG_UNCACHED being set. + */ + kstat_named_t arcstat_uncached_size; + kstat_named_t arcstat_uncached_data; + kstat_named_t arcstat_uncached_metadata; + /* + * Number of data bytes that are going to be evicted from ARC due to + * ARC_FLAG_UNCACHED being set. + */ + kstat_named_t arcstat_uncached_evictable_data; + /* + * Number of metadata bytes that are going to be evicted from ARC + * due to ARC_FLAG_UNCACHED being set. + */ + kstat_named_t arcstat_uncached_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; /* @@ -839,13 +889,20 @@ typedef struct arc_stats { kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_dnode_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; + /* Number of predictive prefetch requests. */ + kstat_named_t arcstat_predictive_prefetch; + /* Number of requests for which predictive prefetch has completed. */ kstat_named_t arcstat_demand_hit_predictive_prefetch; + /* Number of requests for which predictive prefetch was running. 
*/ + kstat_named_t arcstat_demand_iohit_predictive_prefetch; + /* Number of prescient prefetch requests. */ + kstat_named_t arcstat_prescient_prefetch; + /* Number of requests for which prescient prefetch has completed. */ kstat_named_t arcstat_demand_hit_prescient_prefetch; + /* Number of requests for which prescient prefetch was running. */ + kstat_named_t arcstat_demand_iohit_prescient_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; kstat_named_t arcstat_raw_size; @@ -855,19 +912,25 @@ typedef struct arc_stats { typedef struct arc_sums { wmsum_t arcstat_hits; + wmsum_t arcstat_iohits; wmsum_t arcstat_misses; wmsum_t arcstat_demand_data_hits; + wmsum_t arcstat_demand_data_iohits; wmsum_t arcstat_demand_data_misses; wmsum_t arcstat_demand_metadata_hits; + wmsum_t arcstat_demand_metadata_iohits; wmsum_t arcstat_demand_metadata_misses; wmsum_t arcstat_prefetch_data_hits; + wmsum_t arcstat_prefetch_data_iohits; wmsum_t arcstat_prefetch_data_misses; wmsum_t arcstat_prefetch_metadata_hits; + wmsum_t arcstat_prefetch_metadata_iohits; wmsum_t arcstat_prefetch_metadata_misses; wmsum_t arcstat_mru_hits; wmsum_t arcstat_mru_ghost_hits; wmsum_t arcstat_mfu_hits; wmsum_t arcstat_mfu_ghost_hits; + wmsum_t arcstat_uncached_hits; wmsum_t arcstat_deleted; wmsum_t arcstat_mutex_miss; wmsum_t arcstat_access_skip; @@ -889,7 +952,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - aggsum_t arcstat_dnode_size; + wmsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; @@ -934,10 +997,14 @@ typedef struct arc_sums { wmsum_t arcstat_memory_direct_count; wmsum_t arcstat_memory_indirect_count; wmsum_t arcstat_prune; - aggsum_t arcstat_meta_used; + wmsum_t arcstat_meta_used; wmsum_t arcstat_async_upgrade_sync; + wmsum_t arcstat_predictive_prefetch; wmsum_t arcstat_demand_hit_predictive_prefetch; + wmsum_t arcstat_demand_iohit_predictive_prefetch; + 
wmsum_t arcstat_prescient_prefetch; wmsum_t arcstat_demand_hit_prescient_prefetch; + wmsum_t arcstat_demand_iohit_prescient_prefetch; wmsum_t arcstat_raw_size; wmsum_t arcstat_cached_only_in_progress; wmsum_t arcstat_abd_chunk_waste_size; @@ -958,7 +1025,9 @@ typedef struct arc_evict_waiter { #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_meta ARCSTAT(arcstat_meta) /* target frac of metadata */ +#define arc_pd ARCSTAT(arcstat_pd) /* target frac of data MRU */ +#define arc_pm ARCSTAT(arcstat_pm) /* target frac of meta MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ @@ -970,23 +1039,24 @@ typedef struct arc_evict_waiter { #define arc_mfu (&ARC_mfu) #define arc_mfu_ghost (&ARC_mfu_ghost) #define arc_l2c_only (&ARC_l2c_only) +#define arc_uncached (&ARC_uncached) extern taskq_t *arc_prune_taskq; extern arc_stats_t arc_stats; extern arc_sums_t arc_sums; extern hrtime_t arc_growtime; extern boolean_t arc_warm; -extern int arc_grow_retry; -extern int arc_no_grow_shift; -extern int arc_shrink_shift; +extern uint_t arc_grow_retry; +extern uint_t arc_no_grow_shift; +extern uint_t arc_shrink_shift; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; extern arc_state_t ARC_mfu; extern arc_state_t ARC_mru; extern uint_t zfs_arc_pc_percent; -extern int arc_lotsfree_percent; -extern unsigned long zfs_arc_min; -extern unsigned long zfs_arc_max; +extern uint_t arc_lotsfree_percent; +extern uint64_t zfs_arc_min; +extern uint64_t zfs_arc_max; extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); @@ -995,7 +1065,6 @@ extern void arc_wait_for_eviction(uint64_t, boolean_t); extern void arc_lowmem_init(void); extern void 
arc_lowmem_fini(void); -extern void arc_prune_async(int64_t); extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); @@ -1003,7 +1072,7 @@ extern void arc_tuning_update(boolean_t); extern void arc_register_hotplug(void); extern void arc_unregister_hotplug(void); -extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS); +extern int param_set_arc_u64(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); diff --git a/include/sys/spa_boot.h b/include/sys/asm_linkage.h index 1d3622f5a108..749157d4c3db 100644 --- a/include/sys/spa_boot.h +++ b/include/sys/asm_linkage.h @@ -2,11 +2,12 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -19,24 +20,29 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#ifndef _SYS_SPA_BOOT_H -#define _SYS_SPA_BOOT_H +#ifndef _SYS_ASM_LINKAGE_H +#define _SYS_ASM_LINKAGE_H -#include <sys/nvpair.h> +#define ASMABI + +#if defined(__i386) || defined(__amd64) + +#include <sys/ia32/asm_linkage.h> /* XX64 x86/sys/asm_linkage.h */ -#ifdef __cplusplus -extern "C" { #endif -extern char *spa_get_bootprop(char *prop); -extern void spa_free_bootprop(char *prop); +#if defined(_KERNEL) && defined(HAVE_KERNEL_OBJTOOL) -#ifdef __cplusplus -} +#include <asm/frame.h> + +#else /* userspace */ +#define FRAME_BEGIN +#define FRAME_END #endif -#endif /* _SYS_SPA_BOOT_H */ + +#endif /* _SYS_ASM_LINKAGE_H */ diff --git a/include/sys/avl.h b/include/sys/avl.h index 20e88f2a6b06..8818e3edb292 100644 --- a/include/sys/avl.h +++ b/include/sys/avl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/avl_impl.h b/include/sys/avl_impl.h index c464a62a1ca6..85277b42b471 100644 --- a/include/sys/avl_impl.h +++ b/include/sys/avl_impl.h @@ -7,7 +7,7 @@ * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/bitmap.h b/include/sys/bitmap.h new file mode 100644 index 000000000000..71eeba592cfd --- /dev/null +++ b/include/sys/bitmap.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_BITMAP_H +#define _SYS_BITMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Operations on bitmaps of arbitrary size + * A bitmap is a vector of 1 or more ulong_t's. + * The user of the package is responsible for range checks and keeping + * track of sizes. + */ + +#ifdef _LP64 +#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */ +#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#else +#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#endif + +#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */ +#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */ + +/* + * bitmap is a ulong_t *, bitindex an index_t + * + * The macros BT_WIM and BT_BIW are internal; there is no need + * for users of this package to use them. 
+ */ + +/* + * word in map + */ +#define BT_WIM(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT]) +/* + * bit in word + */ +#define BT_BIW(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK)) + +/* + * These are public macros + * + * BT_BITOUL == n bits to n ulong_t's + */ +#define BT_BITOUL(nbits) \ + (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL) +#define BT_SIZEOFMAP(nbits) \ + (BT_BITOUL(nbits) * sizeof (ulong_t)) +#define BT_TEST(bitmap, bitindex) \ + ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0) +#define BT_SET(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } +#define BT_CLEAR(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITMAP_H */ diff --git a/include/sys/bitops.h b/include/sys/bitops.h index 69d07d76552a..5c477b38b205 100644 --- a/include/sys/bitops.h +++ b/include/sys/bitops.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/blake3.h b/include/sys/blake3.h index b3391c5f2349..b981b18db943 100644 --- a/include/sys/blake3.h +++ b/include/sys/blake3.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -22,11 +22,11 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor - * Copyright (c) 2021 Tino Reichardt <milky-zfs@mcmilk.de> + * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> */ -#ifndef BLAKE3_H -#define BLAKE3_H +#ifndef _SYS_BLAKE3_H +#define _SYS_BLAKE3_H #ifdef _KERNEL #include <sys/types.h> @@ -72,7 +72,7 @@ typedef struct { */ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; - /* const blake3_impl_ops_t *ops */ + /* const blake3_ops_t *ops */ const void *ops; } BLAKE3_CTX; @@ -97,29 +97,8 @@ extern void **blake3_per_cpu_ctx; extern void blake3_per_cpu_ctx_init(void); extern void blake3_per_cpu_ctx_fini(void); -/* return number of supported implementations */ -extern int blake3_get_impl_count(void); - -/* return id of selected implementation */ -extern int blake3_get_impl_id(void); - -/* return name of selected implementation */ -extern const char *blake3_get_impl_name(void); - -/* setup id as fastest implementation */ -extern void blake3_set_impl_fastest(uint32_t id); - -/* set implementation by id */ -extern void blake3_set_impl_id(uint32_t id); - -/* set implementation by name */ -extern int blake3_set_impl_name(const char *name); - -/* set startup implementation */ -extern void blake3_setup_impl(void); - #ifdef __cplusplus } #endif -#endif /* BLAKE3_H */ +#endif /* _SYS_BLAKE3_H */ diff --git a/include/sys/bplist.h b/include/sys/bplist.h index f8deaf8437e6..53dd346767fe 100644 --- a/include/sys/bplist.h +++ b/include/sys/bplist.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index 16e403526cff..81bc0fe21086 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -60,7 +60,7 @@ typedef struct bpobj { kmutex_t bpo_lock; objset_t *bpo_os; uint64_t bpo_object; - int bpo_epb; + uint32_t bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; uint8_t bpo_havefreed; @@ -87,6 +87,7 @@ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, int64_t start); void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj); void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx); diff --git a/include/sys/bptree.h b/include/sys/bptree.h index 327c128bf493..9d189446ab69 100644 --- a/include/sys/bptree.h +++ b/include/sys/bptree.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h index 797aecd791a3..edcee16227ec 100644 --- a/include/sys/bqueue.h +++ b/include/sys/bqueue.h @@ -27,27 +27,30 @@ extern "C" { typedef struct bqueue { list_t bq_list; + size_t bq_size; + list_t bq_dequeuing_list; + size_t bq_dequeuing_size; + list_t bq_enqueuing_list; + size_t bq_enqueuing_size; kmutex_t bq_lock; kcondvar_t bq_add_cv; kcondvar_t bq_pop_cv; - uint64_t bq_size; - uint64_t bq_maxsize; - uint64_t bq_fill_fraction; + size_t bq_maxsize; + uint_t bq_fill_fraction; size_t bq_node_offset; } bqueue_t; typedef struct bqueue_node { list_node_t bqn_node; - uint64_t bqn_size; + size_t bqn_size; } bqueue_node_t; -int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); +int bqueue_init(bqueue_t *, uint_t, size_t, size_t); void bqueue_destroy(bqueue_t *); -void bqueue_enqueue(bqueue_t *, void *, uint64_t); -void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); +void bqueue_enqueue(bqueue_t *, void *, size_t); +void bqueue_enqueue_flush(bqueue_t *, void *, size_t); void *bqueue_dequeue(bqueue_t *); -boolean_t bqueue_empty(bqueue_t *); #ifdef __cplusplus } diff --git a/include/sys/brt.h b/include/sys/brt.h new file mode 100644 index 000000000000..f73df95058d9 --- /dev/null +++ b/include/sys/brt.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#ifndef _SYS_BRT_H +#define _SYS_BRT_H + +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/dmu.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); +extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp); + +extern uint64_t brt_get_dspace(spa_t *spa); +extern uint64_t brt_get_used(spa_t *spa); +extern uint64_t brt_get_saved(spa_t *spa); +extern uint64_t brt_get_ratio(spa_t *spa); + +extern boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp); +extern void brt_init(void); +extern void brt_fini(void); + +extern void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx); +extern void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx); +extern void brt_pending_apply(spa_t *spa, uint64_t txg); + +extern void brt_create(spa_t *spa); +extern int brt_load(spa_t *spa); +extern void brt_unload(spa_t *spa); +extern void brt_sync(spa_t *spa, uint64_t txg); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BRT_H */ diff --git a/include/sys/brt_impl.h b/include/sys/brt_impl.h new file mode 100644 index 000000000000..9cc06fbb2c3a --- /dev/null +++ b/include/sys/brt_impl.h @@ -0,0 +1,199 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek + */ + +#ifndef _SYS_BRT_IMPL_H +#define _SYS_BRT_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * BRT - Block Reference Table. + */ +#define BRT_OBJECT_VDEV_PREFIX "com.fudosecurity:brt:vdev:" + +/* + * We divide each VDEV into 16MB chunks. Each chunk is represented in memory + * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B + * Each element in this array represents how many BRT entries do we have in this + * chunk of storage. We always load this entire array into memory and update as + * needed. By having it in memory we can quickly tell (during zio_free()) if + * there are any BRT entries that we might need to update. + * + * This value cannot be larger than 16MB, at least as long as we support + * 512 byte block sizes. With 512 byte block size we can have exactly + * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too + * many for a 16bit counter. + */ +#define BRT_RANGESIZE (16 * 1024 * 1024) +_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX, + "BRT_RANGESIZE is too large."); +/* + * We don't want to update the whole structure every time. Maintain bitmap + * of dirty blocks within the regions, so that a single bit represents a + * block size of entcounts. For example if we have a 1PB vdev then all + * entcounts take 128MB of memory ((1PB / 16MB) * 2B).
We can divide this + * 128MB array of entcounts into 32kB disk blocks, as we don't want to update + * the whole 128MB on disk when we have updated only a single entcount. + * We maintain a bitmap where each 32kB disk block within 128MB entcounts array + * is represented by a single bit. This gives us 4096 bits. A set bit in the + * bitmap means that we had a change in at least one of the 16384 entcounts + * that reside on a 32kB disk block (32kB / sizeof (uint16_t)). + */ +#define BRT_BLOCKSIZE (32 * 1024) +#define BRT_RANGESIZE_TO_NBLOCKS(size) \ + (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) + +#define BRT_LITTLE_ENDIAN 0 +#define BRT_BIG_ENDIAN 1 +#ifdef _ZFS_LITTLE_ENDIAN +#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN +#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN +#else +#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN +#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN +#endif + +typedef struct brt_vdev_phys { + uint64_t bvp_mos_entries; + uint64_t bvp_size; + uint64_t bvp_byteorder; + uint64_t bvp_totalcount; + uint64_t bvp_rangesize; + uint64_t bvp_usedspace; + uint64_t bvp_savedspace; +} brt_vdev_phys_t; + +typedef struct brt_vdev { + /* + * VDEV id. + */ + uint64_t bv_vdevid; + /* + * Is the structure initiated? + * (bv_entcount and bv_bitmap are allocated?) + */ + boolean_t bv_initiated; + /* + * Object number in the MOS for the entcount array and brt_vdev_phys. + */ + uint64_t bv_mos_brtvdev; + /* + * Object number in the MOS for the entries table. + */ + uint64_t bv_mos_entries; + /* + * Entries to sync. + */ + avl_tree_t bv_tree; + /* + * Does the bv_entcount[] array needs byte swapping? + */ + boolean_t bv_need_byteswap; + /* + * Number of entries in the bv_entcount[] array. + */ + uint64_t bv_size; + /* + * This is the array with BRT entry count per BRT_RANGESIZE. + */ + uint16_t *bv_entcount; + /* + * Sum of all bv_entcount[]s. + */ + uint64_t bv_totalcount; + /* + * Space on disk occupied by cloned blocks (without compression). 
+ */ + uint64_t bv_usedspace; + /* + * How much additional space would be occupied without block cloning. + */ + uint64_t bv_savedspace; + /* + * brt_vdev_phys needs updating on disk. + */ + boolean_t bv_meta_dirty; + /* + * bv_entcount[] needs updating on disk. + */ + boolean_t bv_entcount_dirty; + /* + * bv_entcount[] potentially can be a bit too big to synchronize it all + * when we just changed a few entcounts. The fields below allow us to + * track updates to bv_entcount[] array since the last sync. + * A single bit in the bv_bitmap represents as many entcounts as can + * fit into a single BRT_BLOCKSIZE. + * For example we have 65536 entcounts in the bv_entcount array + * (so the whole array is 128kB). We updated bv_entcount[2] and + * bv_entcount[5]. In that case only first bit in the bv_bitmap will + * be set and we will write only first BRT_BLOCKSIZE out of 128kB. + */ + ulong_t *bv_bitmap; + uint64_t bv_nblocks; +} brt_vdev_t; + +/* + * In-core brt + */ +typedef struct brt { + krwlock_t brt_lock; + spa_t *brt_spa; +#define brt_mos brt_spa->spa_meta_objset + uint64_t brt_rangesize; + uint64_t brt_usedspace; + uint64_t brt_savedspace; + avl_tree_t brt_pending_tree[TXG_SIZE]; + kmutex_t brt_pending_lock[TXG_SIZE]; + /* Sum of all entries across all bv_trees. */ + uint64_t brt_nentries; + brt_vdev_t *brt_vdevs; + uint64_t brt_nvdevs; +} brt_t; + +/* Size of bre_offset / sizeof (uint64_t). */ +#define BRT_KEY_WORDS (1) + +/* + * In-core brt entry. + * On-disk we use bre_offset as the key and bre_refcount as the value.
+ */ +typedef struct brt_entry { + uint64_t bre_offset; + uint64_t bre_refcount; + avl_node_t bre_node; +} brt_entry_t; + +typedef struct brt_pending_entry { + blkptr_t bpe_bp; + int bpe_count; + avl_node_t bpe_node; +} brt_pending_entry_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BRT_IMPL_H */ diff --git a/include/sys/btree.h b/include/sys/btree.h index a901d654ef1c..6e05eee8f01d 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -65,7 +65,7 @@ extern "C" { * them, and increased memory overhead. Increasing these values results in * higher variance in operation time, and reduces memory overhead. */ -#define BTREE_CORE_ELEMS 128 +#define BTREE_CORE_ELEMS 126 #define BTREE_LEAF_SIZE 4096 extern kmem_cache_t *zfs_btree_leaf_cache; @@ -95,9 +95,6 @@ typedef struct zfs_btree_leaf { uint8_t btl_elems[]; } zfs_btree_leaf_t; -#define BTREE_LEAF_ESIZE (BTREE_LEAF_SIZE - \ - offsetof(zfs_btree_leaf_t, btl_elems)) - typedef struct zfs_btree_index { zfs_btree_hdr_t *bti_node; uint32_t bti_offset; @@ -108,16 +105,69 @@ typedef struct zfs_btree_index { boolean_t bti_before; } zfs_btree_index_t; -typedef struct btree { - zfs_btree_hdr_t *bt_root; - int64_t bt_height; +typedef struct btree zfs_btree_t; +typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t, + const void *, zfs_btree_index_t *); + +struct btree { + int (*bt_compar) (const void *, const void *); + bt_find_in_buf_f bt_find_in_buf; size_t bt_elem_size; + size_t bt_leaf_size; uint32_t bt_leaf_cap; + int32_t bt_height; uint64_t bt_num_elems; uint64_t bt_num_nodes; + zfs_btree_hdr_t *bt_root; zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading - int (*bt_compar) (const void *, const void *); -} zfs_btree_t; +}; + +/* + * Implementation of Shar's algorithm designed to accelerate binary search by + * eliminating impossible to predict branches. 
+ * + * For optimality, this should be used to generate the search function in the + * same file as the comparator and the comparator should be marked + * `__attribute__((always_inline)) inline` so that the compiler will inline it. + * + * Arguments are: + * + * NAME - The function name for this instance of the search function. Use it + * in a subsequent call to zfs_btree_create(). + * T - The element type stored inside the B-Tree. + * COMP - A comparator to compare two nodes, it must return exactly: -1, 0, + * or +1; -1 for <, 0 for ==, and +1 for >. For trivial comparisons, + * TREE_CMP() from avl.h can be used in a boilerplate function. + */ +/* BEGIN CSTYLED */ +#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \ +_Pragma("GCC diagnostic push") \ +_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \ +static void * \ +NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \ + const void *value, zfs_btree_index_t *where) \ +{ \ + T *i = (T *)buf; \ + (void) tree; \ + _Pragma("GCC unroll 9") \ + while (nelems > 1) { \ + uint32_t half = nelems / 2; \ + nelems -= half; \ + i += (COMP(&i[half - 1], value) < 0) * half; \ + } \ + \ + int comp = COMP(i, value); \ + where->bti_offset = (i - (T *)buf) + (comp < 0); \ + where->bti_before = (comp != 0); \ + \ + if (comp == 0) { \ + return (i); \ + } \ + \ + return (NULL); \ +} \ +_Pragma("GCC diagnostic pop") +/* END CSTYLED */ /* * Allocate and deallocate caches for btree nodes. @@ -131,10 +181,19 @@ void zfs_btree_fini(void); * tree - the tree to be initialized * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 * -1 for <, 0 for ==, and +1 for > + * find - optional function to accelerate searches inside B-Tree nodes + * through Shar's algorithm and comparator inlining. Setting this to + * NULL will use a generic function. The function should be created + * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
+ * compar should be marked `__attribute__((always_inline)) inline` or + * performance is unlikely to improve very much. * size - the value of sizeof(struct my_type) + * lsize - custom leaf size */ void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), - size_t); + bt_find_in_buf_f, size_t); +void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *), + bt_find_in_buf_f, size_t, size_t); /* * Find a node with a matching value in the tree. Returns the matching node diff --git a/include/sys/crypto/api.h b/include/sys/crypto/api.h index b3d6c9c071b9..88e0ac4d9699 100644 --- a/include/sys/crypto/api.h +++ b/include/sys/crypto/api.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/crypto/common.h b/include/sys/crypto/common.h index 45a95d7eed71..261e88eceeea 100644 --- a/include/sys/crypto/common.h +++ b/include/sys/crypto/common.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/crypto/icp.h b/include/sys/crypto/icp.h index ae7f7eae529e..8c3f19886fd8 100644 --- a/include/sys/crypto/icp.h +++ b/include/sys/crypto/icp.h @@ -7,7 +7,7 @@ * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/dataset_kstats.h b/include/sys/dataset_kstats.h index b165b98576dd..c81a07f0c116 100644 --- a/include/sys/dataset_kstats.h +++ b/include/sys/dataset_kstats.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -30,6 +30,7 @@ #include <sys/wmsum.h> #include <sys/dmu.h> #include <sys/kstat.h> +#include <sys/zil.h> typedef struct dataset_sum_stats_t { wmsum_t dss_writes; @@ -56,15 +57,21 @@ typedef struct dataset_kstat_values { * entry is removed from the unlinked set */ kstat_named_t dkv_nunlinked; + /* + * Per dataset zil kstats + */ + zil_kstat_values_t dkv_zil_stats; } dataset_kstat_values_t; typedef struct dataset_kstats { dataset_sum_stats_t dk_sums; + zil_sums_t dk_zil_sums; kstat_t *dk_kstats; } dataset_kstats_t; -void dataset_kstats_create(dataset_kstats_t *, objset_t *); +int dataset_kstats_create(dataset_kstats_t *, objset_t *); void dataset_kstats_destroy(dataset_kstats_t *); +void dataset_kstats_rename(dataset_kstats_t *dk, const char *); void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 959e111cee7a..3808a04cba80 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -55,26 +55,31 @@ extern "C" { #define DB_RF_NEVERWAIT (1 << 4) #define DB_RF_CACHED (1 << 5) #define DB_RF_NO_DECRYPT (1 << 6) +#define DB_RF_PARTIAL_FIRST (1 << 7) +#define DB_RF_PARTIAL_MORE (1 << 8) /* * The simplified state transition diagram for dbufs looks like: * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ ^ - * | | | - * +----> FILL ----+ | - * | | - * | | - * +--------> NOFILL -------+ + * +--> READ --+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * ^ | ^ ^ + * | | | | + * | +--> FILL --+ | + * | | | + * | | | + * | +------> NOFILL -----+ + * | | + * +---------------+ * * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range * to find all dbufs in a range of a dnode and must be less than any other * dbuf_states_t (see comment on dn_dbufs in dnode.h). */ typedef enum dbuf_states { + DB_MARKER = -2, DB_SEARCH = -1, DB_UNCACHED, DB_FILL, @@ -170,6 +175,7 @@ typedef struct dbuf_dirty_record { override_states_t dr_override_state; uint8_t dr_copies; boolean_t dr_nopwrite; + boolean_t dr_brtwrite; boolean_t dr_has_raw_params; /* @@ -190,7 +196,7 @@ typedef struct dbuf_dirty_record { uint64_t dr_blkid; abd_t *dr_abd; zio_prop_t dr_props; - enum zio_flag dr_flags; + zio_flag_t dr_flags; } dll; } dt; } dbuf_dirty_record_t; @@ -294,6 +300,8 @@ typedef struct dmu_buf_impl { /* Tells us which dbuf cache this dbuf is in, if any */ dbuf_cached_state_t db_caching_status; + uint64_t db_hash; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. */ @@ -319,14 +327,19 @@ typedef struct dmu_buf_impl { uint8_t db_pending_evict; uint8_t db_dirtycnt; + + /* The buffer was partially read. More reads may follow. 
*/ + uint8_t db_partial_read; } dmu_buf_impl_t; -#define DBUF_RWLOCKS 8192 -#define DBUF_HASH_RWLOCK(h, idx) (&(h)->hash_rwlocks[(idx) & (DBUF_RWLOCKS-1)]) +#define DBUF_HASH_MUTEX(h, idx) \ + (&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)]) + typedef struct dbuf_hash_table { uint64_t hash_table_mask; + uint64_t hash_mutex_mask; dmu_buf_impl_t **hash_table; - krwlock_t hash_rwlocks[DBUF_RWLOCKS] ____cacheline_aligned; + kmutex_t *hash_mutexes; } dbuf_hash_table_t; typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); @@ -362,23 +375,25 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting); dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, - uint64_t blkid); + uint64_t blkid, uint64_t *hash_out); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); +boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); +boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, - const struct zio_prop *zp, enum zio_flag flags, dmu_tx_t *tx); + const struct zio_prop *zp, zio_flag_t flags, dmu_tx_t *tx); void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx); void 
dbuf_destroy(dmu_buf_impl_t *db); diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 25be6f56dddc..726f1a3902eb 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_DDT_H @@ -39,32 +40,50 @@ extern "C" { struct abd; /* - * On-disk DDT formats, in the desired search order (newest version first). + * DDT on-disk storage object types. Each one corresponds to a specific + * implementation, see ddt_ops_t. The value itself is not stored on disk. + * + * When searching for an entry, object types will be searched in this order. + * + * Note that DDT_TYPES is used as the "no type" for new entries that have not + * yet been written to a storage object. */ -enum ddt_type { - DDT_TYPE_ZAP = 0, +typedef enum { + DDT_TYPE_ZAP = 0, /* ZAP storage object, ddt_zap */ DDT_TYPES -}; +} ddt_type_t; + +_Static_assert(DDT_TYPES <= UINT8_MAX, + "ddt_type_t must fit in a uint8_t"); + +/* New and updated entries receive this type, see ddt_sync_entry() */ +#define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP) /* - * DDT classes, in the desired search order (highest replication level first). + * DDT storage classes. Each class has a separate storage object for each type. + * The value itself is not stored on disk. + * + * When searching for an entry, object classes will be searched in this order. + * + * Note that DDT_CLASSES is used as the "no class" for new entries that have not + * yet been written to a storage object.
*/ -enum ddt_class { - DDT_CLASS_DITTO = 0, - DDT_CLASS_DUPLICATE, - DDT_CLASS_UNIQUE, +typedef enum { + DDT_CLASS_DITTO = 0, /* entry has ditto blocks (obsolete) */ + DDT_CLASS_DUPLICATE, /* entry has multiple references */ + DDT_CLASS_UNIQUE, /* entry has a single reference */ DDT_CLASSES -}; - -#define DDT_TYPE_CURRENT 0 +} ddt_class_t; -#define DDT_COMPRESS_BYTEORDER_MASK 0x80 -#define DDT_COMPRESS_FUNCTION_MASK 0x7f +_Static_assert(DDT_CLASSES < UINT8_MAX, + "ddt_class_t must fit in a uint8_t"); /* - * On-disk ddt entry: key (name) and physical storage (value). + * The "key" part of an on-disk entry. This is the unique "name" for a block, + * that is, that parts of the block pointer that will always be the same for + * the same data. */ -typedef struct ddt_key { +typedef struct { zio_cksum_t ddk_cksum; /* 256-bit block checksum */ /* * Encoded with logical & physical size, encryption, and compression, @@ -76,6 +95,10 @@ typedef struct ddt_key { uint64_t ddk_prop; } ddt_key_t; +/* + * Macros for accessing parts of a ddt_key_t. These are similar to their BP_* + * counterparts. + */ #define DDK_GET_LSIZE(ddk) \ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) #define DDK_SET_LSIZE(ddk, x) \ @@ -92,18 +115,25 @@ typedef struct ddt_key { #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) -#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) - -#define DDE_GET_NDVAS(dde) (DDK_GET_CRYPT(&dde->dde_key) \ - ? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP) - -typedef struct ddt_phys { +/* + * The "value" part for an on-disk entry. These are the "physical" + * characteristics of the stored block, such as its location on disk (DVAs), + * birth txg and ref count. + * + * Note that an entry has an array of four ddt_phys_t, one for each number of + * DVAs (copies= property) and another for additional "ditto" copies. 
Most + * users of ddt_phys_t will handle indexing into or counting the phys they + * want. + */ +typedef struct { dva_t ddp_dva[SPA_DVAS_PER_BP]; uint64_t ddp_refcnt; uint64_t ddp_phys_birth; } ddt_phys_t; /* + * Named indexes into the ddt_phys_t array in each entry. + * * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, * we maintain the ability to free existing dedup-ditto blocks. */ @@ -116,99 +146,83 @@ enum ddt_phys_type { }; /* - * In-core ddt entry + * A "live" entry, holding changes to an entry made this txg, and other data to + * support loading, updating and repairing the entry. */ -struct ddt_entry { - ddt_key_t dde_key; - ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + +/* State flags for dde_flags */ +#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ + +typedef struct { + /* key must be first for ddt_key_compare */ + ddt_key_t dde_key; /* ddt_tree key */ + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */ + + /* in-flight update IOs */ zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + + /* copy of data after a repair read, to be rewritten */ struct abd *dde_repair_abd; - enum ddt_type dde_type; - enum ddt_class dde_class; - uint8_t dde_loading; - uint8_t dde_loaded; - kcondvar_t dde_cv; - avl_node_t dde_node; -}; + + /* storage type and class the entry was loaded from */ + ddt_type_t dde_type; + ddt_class_t dde_class; + + uint8_t dde_flags; /* load state flags */ + kcondvar_t dde_cv; /* signaled when load completes */ + + avl_node_t dde_node; /* ddt_tree node */ +} ddt_entry_t; /* - * In-core ddt + * In-core DDT object. This covers all entries and stats for the whole pool + * for a given checksum type.
*/ -struct ddt { - kmutex_t ddt_lock; - avl_tree_t ddt_tree; - avl_tree_t ddt_repair_tree; - enum zio_checksum ddt_checksum; - spa_t *ddt_spa; - objset_t *ddt_os; - uint64_t ddt_stat_object; +typedef struct { + kmutex_t ddt_lock; /* protects changes to all fields */ + + avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ + + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ + spa_t *ddt_spa; /* pool this ddt is on */ + objset_t *ddt_os; /* ddt objset (always MOS) */ + + /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + + /* object ids for whole-ddt and per-type/per-class stats */ + uint64_t ddt_stat_object; + ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; + + /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; - ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; - avl_node_t ddt_node; -}; +} ddt_t; /* - * In-core and on-disk bookmark for DDT walks + * In-core and on-disk bookmark for DDT walks. This is a cursor for ddt_walk(), + * and is stable across calls, even if the DDT is updated, the pool is + * restarted or loaded on another system, or OpenZFS is upgraded. */ -typedef struct ddt_bookmark { +typedef struct { uint64_t ddb_class; uint64_t ddb_type; uint64_t ddb_checksum; uint64_t ddb_cursor; } ddt_bookmark_t; -/* - * Ops vector to access a specific DDT object type. 
- */ -typedef struct ddt_ops { - char ddt_op_name[32]; - int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, - boolean_t prehash); - int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); - int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); - void (*ddt_op_prefetch)(objset_t *os, uint64_t object, - ddt_entry_t *dde); - int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, - uint64_t *walk); - int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); -} ddt_ops_t; - -#define DDT_NAMELEN 107 - -extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, char *name); -extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, uint64_t *walk, ddt_entry_t *dde); -extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, uint64_t *count); -extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, dmu_object_info_t *); -extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz); - extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp); -extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); - extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); extern void ddt_phys_clear(ddt_phys_t *ddp); extern void ddt_phys_addref(ddt_phys_t *ddp); extern void ddt_phys_decref(ddt_phys_t *ddp); -extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, - uint64_t txg); extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); -extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); - -extern void ddt_stat_add(ddt_stat_t 
*dst, const ddt_stat_t *src, uint64_t neg); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); @@ -220,9 +234,6 @@ extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); extern uint64_t ddt_get_dedup_dspace(spa_t *spa); extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); -extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); -extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); - extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); @@ -232,23 +243,21 @@ extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); -extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, +extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); -extern int ddt_entry_compare(const void *x1, const void *x2); +extern int ddt_key_compare(const void *x1, const void *x2); extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); -extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx); -extern const ddt_ops_t ddt_zap_ops; +extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); #ifdef __cplusplus } diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h new file mode 100644 index 000000000000..52b927b7519d --- /dev/null +++ b/include/sys/ddt_impl.h @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The 
contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. + */ + +#ifndef _SYS_DDT_IMPL_H +#define _SYS_DDT_IMPL_H + +#include <sys/ddt.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Ops vector to access a specific DDT object type. 
+ */ +typedef struct { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, + const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + int (*ddt_op_contains)(objset_t *os, uint64_t object, + const ddt_key_t *ddk); + void (*ddt_op_prefetch)(objset_t *os, uint64_t object, + const ddt_key_t *ddk); + int (*ddt_op_update)(objset_t *os, uint64_t object, + const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, + const ddt_key_t *ddk, dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, + ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); +} ddt_ops_t; + +extern const ddt_ops_t ddt_zap_ops; + +extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); + +/* + * These are only exposed so that zdb can access them. Try not to use them + * outside of the DDT implementation proper, and if you do, consider moving + * them up. + */ + +/* + * Enough room to expand DMU_POOL_DDT format for all possible DDT + * checksum/class/type combinations. 
+ */ +#define DDT_NAMELEN 32 + +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + char *name); +extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + uint64_t *walk, ddt_entry_t *dde); +extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + uint64_t *count); +extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + dmu_object_info_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_IMPL_H */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 7c55a5c26189..b5fed64da4ad 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -27,6 +27,7 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -136,18 +137,24 @@ typedef enum dmu_object_byteswap { #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? 
\ - ((ot) & DMU_OT_METADATA) : \ + (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) +#define DMU_OT_IS_CRITICAL(ot) \ + (DMU_OT_IS_METADATA(ot) && \ + (ot) != DMU_OT_DNODE && \ + (ot) != DMU_OT_DIRECTORY_CONTENTS && \ + (ot) != DMU_OT_SA) + /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_ENCRYPTED) : \ + (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* @@ -371,6 +378,7 @@ typedef struct dmu_buf { #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" +#define DMU_POOL_ERRORSCRUB "error_scrub" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" @@ -564,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); +int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, + dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. @@ -640,6 +652,9 @@ typedef struct dmu_buf_user { */ taskq_ent_t dbu_tqent; + /* Size of user data, for inclusion in dbuf_cache accounting. 
*/ + uint64_t dbu_size; + /* * This instance's eviction function pointers. * @@ -722,13 +737,21 @@ void *dmu_buf_replace_user(dmu_buf_t *db, void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* + * User data size accounting. This can be used to artificially inflate the size + * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough + * to satisfy memory reclaim requests. It's not used for anything else, and + * defaults to 0. + */ +uint64_t dmu_buf_user_size(dmu_buf_t *db); +void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd); +void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); + +/* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); @@ -775,6 +798,11 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); +void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); +void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, @@ -865,13 +893,16 @@ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf -extern int zfs_max_recordsize; +extern uint_t zfs_max_recordsize; /* * Asynchronously try to read in the data. 
*/ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); +void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ @@ -907,7 +938,7 @@ typedef const struct dmu_object_byteswap_info { } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; -extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; +extern dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. @@ -1052,6 +1083,11 @@ int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); +int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, struct blkptr *bps, size_t *nbpsp); +int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps); + /* * Initial setup and final teardown. */ @@ -1070,7 +1106,7 @@ int dmu_diff(const char *tosnap_name, const char *fromsnap_name, #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; -extern int dmu_prefetch_max; +extern uint_t dmu_prefetch_max; #ifdef __cplusplus } diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 95ec11ce2400..83ae2b76ba1f 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -247,8 +247,6 @@ typedef struct dmu_sendstatus { void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); -int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, - const void *, dmu_buf_t **); #ifdef __cplusplus } diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 782338fd210a..a9123e862af7 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -72,6 +72,10 @@ struct dmu_tx; */ #define OBJSET_CRYPT_PORTABLE_FLAGS_MASK (0) +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-variable-sized-type-not-at-end" +#endif typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; @@ -88,6 +92,9 @@ typedef struct objset_phys { char os_pad1[OBJSET_PHYS_SIZE_V3 - OBJSET_PHYS_SIZE_V2 - sizeof (dnode_phys_t)]; } objset_phys_t; +#if defined(__clang__) +#pragma clang diagnostic pop +#endif typedef int (*dmu_objset_upgrade_cb_t)(objset_t *); @@ -125,6 +132,7 @@ struct objset { zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; + zfs_prefetch_type_t os_prefetch; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; uint64_t os_recordsize; diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index 1fdb986e2ed6..3390ca1089f8 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with 
the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2019 Datto Inc. */ #ifndef _DMU_RECV_H @@ -47,6 +48,7 @@ typedef struct dmu_recv_cookie { boolean_t drc_byteswap; uint64_t drc_featureflags; boolean_t drc_force; + boolean_t drc_heal; boolean_t drc_resumable; boolean_t drc_should_save; boolean_t drc_raw; @@ -77,8 +79,8 @@ typedef struct dmu_recv_cookie { objlist_t *drc_ignore_objlist; } dmu_recv_cookie_t; -int dmu_recv_begin(char *, char *, dmu_replay_record_t *, - boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *, +int dmu_recv_begin(const char *, const char *, dmu_replay_record_t *, + boolean_t, boolean_t, boolean_t, nvlist_t *, nvlist_t *, const char *, dmu_recv_cookie_t *, zfs_file_t *, offset_t *); int dmu_recv_stream(dmu_recv_cookie_t *, offset_t *); int dmu_recv_end(dmu_recv_cookie_t *, void *); diff --git a/include/sys/dmu_redact.h b/include/sys/dmu_redact.h index 85f4b0522891..c18e2be103b7 100644 --- a/include/sys/dmu_redact.h +++ b/include/sys/dmu_redact.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index d150f816c945..061b81532fb1 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h index d76bfe3c9af3..7a0b38da7302 100644 --- a/include/sys/dmu_traverse.h +++ b/include/sys/dmu_traverse.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index ad3f1b0e47ca..aa55da626149 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -90,6 +90,8 @@ enum dmu_tx_hold_type { THT_ZAP, THT_SPACE, THT_SPILL, + THT_CLONE, + THT_APPEND, THT_NUMTYPES }; diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index cd1b79eb8e44..322472fb1ae2 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -36,8 +36,6 @@ extern "C" { #endif -extern unsigned long zfetch_array_rd_sz; - struct dnode; /* so we can reference dnode */ typedef struct zfetch { @@ -47,18 +45,24 @@ typedef struct zfetch { int zf_numstreams; /* number of zstream_t's */ } zfetch_t; +typedef struct zsrange { + uint16_t start; + uint16_t end; +} zsrange_t; + +#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */ + typedef struct zstream { + list_node_t zs_node; /* link for zf_stream */ uint64_t zs_blkid; /* expect next access at this blkid */ + uint_t zs_atime; /* time last prefetch issued */ + zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */ unsigned int zs_pf_dist; /* data prefetch distance in bytes */ unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */ uint64_t zs_pf_start; /* first data block to prefetch */ uint64_t zs_pf_end; /* data block to prefetch up to */ uint64_t zs_ipf_start; /* first data block to prefetch L1 */ uint64_t zs_ipf_end; /* data block to prefetch L1 up to */ - - list_node_t zs_node; /* link for zf_stream */ - hrtime_t zs_atime; /* time last prefetch issued */ - zfetch_t *zs_fetch; /* parent fetch */ boolean_t zs_missed; /* stream saw cache misses */ boolean_t zs_more; /* need more distant prefetch */ zfs_refcount_t zs_callers; /* number of pending callers */ @@ -76,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t); -void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t); +void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, boolean_t); diff --git 
a/include/sys/dnode.h b/include/sys/dnode.h index 9745ae5bb651..dbe7350d4da7 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -36,6 +36,7 @@ #include <sys/dmu_zfetch.h> #include <sys/zrlock.h> #include <sys/multilist.h> +#include <sys/wmsum.h> #ifdef __cplusplus extern "C" { @@ -119,7 +120,11 @@ extern "C" { #define DN_MAX_LEVELS (DIV_ROUND_UP(DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT, \ DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT) + 1) -#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ +/* + * Use the flexible array instead of the fixed length one dn_bonus + * to address memcpy/memmove fortify error + */ +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus_flexible + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) #define DN_MAX_BONUS_LEN(dnp) \ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 
\ @@ -265,6 +270,10 @@ typedef struct dnode_phys { sizeof (blkptr_t)]; blkptr_t dn_spill; }; + struct { + blkptr_t __dn_ignore4; + uint8_t dn_bonus_flexible[]; + }; }; } dnode_phys_t; @@ -456,15 +465,11 @@ void dnode_free_interior_slots(dnode_t *dn); #define DNODE_IS_DIRTY(_dn) \ ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) -#define DNODE_IS_CACHEABLE(_dn) \ +#define DNODE_LEVEL_IS_CACHEABLE(_dn, _level) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (DMU_OT_IS_METADATA((_dn)->dn_type) && \ + (((_level) > 0 || DMU_OT_IS_METADATA((_dn)->dn_type)) && \ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)) -#define DNODE_META_IS_CACHEABLE(_dn) \ - ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ - (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) - /* * Used for dnodestats kstat. */ @@ -587,10 +592,42 @@ typedef struct dnode_stats { kstat_named_t dnode_move_active; } dnode_stats_t; +typedef struct dnode_sums { + wmsum_t dnode_hold_dbuf_hold; + wmsum_t dnode_hold_dbuf_read; + wmsum_t dnode_hold_alloc_hits; + wmsum_t dnode_hold_alloc_misses; + wmsum_t dnode_hold_alloc_interior; + wmsum_t dnode_hold_alloc_lock_retry; + wmsum_t dnode_hold_alloc_lock_misses; + wmsum_t dnode_hold_alloc_type_none; + wmsum_t dnode_hold_free_hits; + wmsum_t dnode_hold_free_misses; + wmsum_t dnode_hold_free_lock_misses; + wmsum_t dnode_hold_free_lock_retry; + wmsum_t dnode_hold_free_refcount; + wmsum_t dnode_hold_free_overflow; + wmsum_t dnode_free_interior_lock_retry; + wmsum_t dnode_allocate; + wmsum_t dnode_reallocate; + wmsum_t dnode_buf_evict; + wmsum_t dnode_alloc_next_chunk; + wmsum_t dnode_alloc_race; + wmsum_t dnode_alloc_next_block; + wmsum_t dnode_move_invalid; + wmsum_t dnode_move_recheck1; + wmsum_t dnode_move_recheck2; + wmsum_t dnode_move_special; + wmsum_t dnode_move_handle; + wmsum_t dnode_move_rwlock; + wmsum_t dnode_move_active; +} dnode_sums_t; + extern dnode_stats_t dnode_stats; +extern dnode_sums_t dnode_sums; 
#define DNODE_STAT_INCR(stat, val) \ - atomic_add_64(&dnode_stats.stat.value.ui64, (val)); + wmsum_add(&dnode_sums.stat, (val)) #define DNODE_STAT_BUMP(stat) \ DNODE_STAT_INCR(stat, 1); diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h index 353c5c2d260f..d4e559a09037 100644 --- a/include/sys/dsl_bookmark.h +++ b/include/sys/dsl_bookmark.h @@ -72,6 +72,7 @@ typedef struct redaction_list_phys { typedef struct redaction_list { dmu_buf_user_t rl_dbu; redaction_list_phys_t *rl_phys; + dmu_buf_t *rl_bonus; dmu_buf_t *rl_dbuf; uint64_t rl_object; zfs_refcount_t rl_longholds; diff --git a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h index db594eece1c3..fbcae3715355 100644 --- a/include/sys/dsl_crypt.h +++ b/include/sys/dsl_crypt.h @@ -206,6 +206,7 @@ void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, dmu_tx_t *tx); int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, boolean_t *will_encrypt); +boolean_t dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb); void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, @@ -222,5 +223,6 @@ int spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb, dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt); +zfs_keystatus_t dsl_dataset_get_keystatus(dsl_dir_t *dd); #endif diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 25f86bce2e63..3450527af7e0 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
+ * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -301,6 +301,14 @@ typedef struct dsl_dataset_snapshot_arg { proc_t *ddsa_proc; } dsl_dataset_snapshot_arg_t; +typedef struct dsl_dataset_rename_snapshot_arg { + const char *ddrsa_fsname; + const char *ddrsa_oldsnapname; + const char *ddrsa_newsnapname; + boolean_t ddrsa_recursive; + dmu_tx_t *ddrsa_tx; +} dsl_dataset_rename_snapshot_arg_t; + /* * The max length of a temporary tag prefix is the number of hex digits * required to express UINT64_MAX plus one for the hyphen. @@ -375,7 +383,6 @@ boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx); -void dsl_dataset_feature_set_activation(const blkptr_t *bp, dsl_dataset_t *ds); void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, @@ -474,6 +481,9 @@ void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, nvlist_t *result); +int dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx); +void dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx); + uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h index 64358bb5fc0b..3feb3bbf062f 100644 --- a/include/sys/dsl_deadlist.h +++ b/include/sys/dsl_deadlist.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
+ * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -84,7 +84,7 @@ typedef struct livelist_condense_entry { boolean_t cancelled; } livelist_condense_entry_t; -extern unsigned long zfs_livelist_max_entries; +extern uint64_t zfs_livelist_max_entries; extern int zfs_livelist_min_percent_shared; typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h index 7f46233a889b..d6abac90bbcc 100644 --- a/include/sys/dsl_deleg.h +++ b/include/sys/dsl_deleg.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h index 208d75bacffa..1a9b672a260b 100644 --- a/include/sys/dsl_destroy.h +++ b/include/sys/dsl_destroy.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 6c097e372e44..f7c0d9acd10d 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -52,6 +52,7 @@ struct zthr; #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" #define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" #define DD_FIELD_LIVELIST "com.delphix:livelist" +#define DD_FIELD_SNAPSHOTS_CHANGED "com.ixsystems:snapshots_changed" typedef enum dd_used { DD_USED_HEAD, @@ -115,7 +116,7 @@ struct dsl_dir { /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; /* amount of space we expect to write; == amount of dirty data */ - int64_t dd_space_towrite[TXG_SIZE]; + uint64_t dd_space_towrite[TXG_SIZE]; dsl_deadlist_t dd_livelist; bplist_t dd_pending_frees; @@ -191,7 +192,7 @@ int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); -void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); +void dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx); inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd); void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 5bb5ef20d5b1..abcdc77a4b96 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -57,13 +57,13 @@ struct dsl_scan; struct dsl_crypto_params; struct dsl_deadlist; -extern unsigned long zfs_dirty_data_max; -extern unsigned long zfs_dirty_data_max_max; -extern unsigned long zfs_wrlog_data_max; -extern int zfs_dirty_data_max_percent; -extern int zfs_dirty_data_max_max_percent; -extern int zfs_delay_min_dirty_percent; -extern unsigned long zfs_delay_scale; +extern uint64_t zfs_dirty_data_max; +extern uint64_t zfs_dirty_data_max_max; +extern uint64_t zfs_wrlog_data_max; +extern uint_t zfs_dirty_data_max_percent; +extern uint_t zfs_dirty_data_max_max_percent; +extern uint_t zfs_delay_min_dirty_percent; +extern uint64_t zfs_delay_scale; /* These macros are for indexing into the zfs_all_blkstats_t. */ #define DMU_OT_DEFERRED DMU_OT_NONE diff --git a/include/sys/dsl_prop.h b/include/sys/dsl_prop.h index fba8f908dc9e..7a84f2b6922a 100644 --- a/include/sys/dsl_prop.h +++ b/include/sys/dsl_prop.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index d716510f879d..2e3452e5ebaa 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -29,6 +29,7 @@ #include <sys/zfs_context.h> #include <sys/zio.h> +#include <sys/zap.h> #include <sys/ddt.h> #include <sys/bplist.h> @@ -60,7 +61,7 @@ typedef struct dsl_scan_phys { uint64_t scn_end_time; uint64_t scn_to_examine; /* total bytes to be scanned */ uint64_t scn_examined; /* bytes scanned so far */ - uint64_t scn_to_process; + uint64_t scn_skipped; /* bytes skipped by scanner */ uint64_t scn_processed; uint64_t scn_errors; /* scan I/O error count */ uint64_t scn_ddt_class_max; @@ -78,6 +79,21 @@ typedef enum dsl_scan_flags { #define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN) +typedef struct dsl_errorscrub_phys { + uint64_t dep_func; /* pool_scan_func_t */ + uint64_t dep_state; /* dsl_scan_state_t */ + uint64_t dep_cursor; /* serialized zap cursor for tracing progress */ + uint64_t dep_start_time; /* error scrub start time, unix timestamp */ + uint64_t dep_end_time; /* error scrub end time, unix timestamp */ + uint64_t dep_to_examine; /* total error blocks to be scrubbed */ + uint64_t dep_examined; /* blocks scrubbed so far */ + uint64_t dep_errors; /* error scrub I/O error count */ + uint64_t dep_paused_flags; /* flag for paused */ +} dsl_errorscrub_phys_t; + +#define ERRORSCRUB_PHYS_NUMINTS (sizeof (dsl_errorscrub_phys_t) \ + / sizeof (uint64_t)) + /* * Every pool will have one dsl_scan_t and this structure will contain * in-memory information about the scan and a pointer to the on-disk @@ -151,11 +167,15 @@ typedef struct dsl_scan { uint64_t scn_avg_zio_size_this_txg; uint64_t scn_zios_this_txg; + /* zap cursor for tracing error scrub progress */ + zap_cursor_t errorscrub_cursor; /* members needed for syncing scan status to disk */ dsl_scan_phys_t scn_phys; /* on disk representation of scan */ dsl_scan_phys_t scn_phys_cached; avl_tree_t scn_queue; /* queue of datasets to scan */ uint64_t scn_queues_pending; /* outstanding data to issue */ + /* members needed for syncing error scrub status to disk */ + dsl_errorscrub_phys_t errorscrub_phys; } 
dsl_scan_t; typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; @@ -171,8 +191,12 @@ int dsl_scan_cancel(struct dsl_pool *); int dsl_scan(struct dsl_pool *, pool_scan_func_t); void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); -int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); +boolean_t dsl_errorscrubbing(const struct dsl_pool *dp); +boolean_t dsl_errorscrub_active(dsl_scan_t *scn); void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); +int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, + pool_scrub_cmd_t cmd); +void dsl_errorscrub_sync(struct dsl_pool *, dmu_tx_t *); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); @@ -184,6 +208,7 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, struct dmu_tx *tx); boolean_t dsl_scan_active(dsl_scan_t *scn); boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); +boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn); void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h index 5a5b306419f1..cbdb20ec1d3a 100644 --- a/include/sys/dsl_synctask.h +++ b/include/sys/dsl_synctask.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/dsl_userhold.h b/include/sys/dsl_userhold.h index 071aeb86d1f1..c165edab3c53 100644 --- a/include/sys/dsl_userhold.h +++ b/include/sys/dsl_userhold.h @@ -7,7 +7,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/edonr.h b/include/sys/edonr.h index 79b7cd8c75b8..b19b5eb42c29 100644 --- a/include/sys/edonr.h +++ b/include/sys/edonr.h @@ -1,6 +1,4 @@ /* - * IDI,NTNU - * * CDDL HEADER START * * The contents of this file are subject to the terms of the @@ -19,15 +17,13 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END - * - * Copyright (C) 2009, 2010, Jorn Amundsen <jorn.amundsen@ntnu.no> - * - * Tweaked Edon-R implementation for SUPERCOP, based on NIST API. - * - * $Id: edonr.h 517 2013-02-17 20:34:39Z joern $ */ + /* - * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved + * Based on Edon-R implementation for SUPERCOP, based on NIST API. + * Copyright (c) 2009, 2010 Jørn Amundsen <jorn.amundsen@ntnu.no> + * Copyright (c) 2013 Saso Kiselkov, All rights reserved + * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> */ #ifndef _SYS_EDONR_H_ @@ -40,8 +36,8 @@ extern "C" { #ifdef _KERNEL #include <sys/types.h> #else -#include <stdint.h> /* uint32_t... */ -#include <stdlib.h> /* size_t ... 
*/ +#include <stdint.h> +#include <stdlib.h> #endif /* @@ -52,44 +48,27 @@ extern "C" { */ /* Specific algorithm definitions */ -#define EdonR224_DIGEST_SIZE 28 -#define EdonR224_BLOCK_SIZE 64 -#define EdonR256_DIGEST_SIZE 32 -#define EdonR256_BLOCK_SIZE 64 -#define EdonR384_DIGEST_SIZE 48 -#define EdonR384_BLOCK_SIZE 128 #define EdonR512_DIGEST_SIZE 64 #define EdonR512_BLOCK_SIZE 128 - -#define EdonR256_BLOCK_BITSIZE 512 #define EdonR512_BLOCK_BITSIZE 1024 typedef struct { - uint32_t DoublePipe[16]; - uint8_t LastPart[EdonR256_BLOCK_SIZE * 2]; -} EdonRData256; -typedef struct { uint64_t DoublePipe[16]; uint8_t LastPart[EdonR512_BLOCK_SIZE * 2]; } EdonRData512; typedef struct { - size_t hashbitlen; - - /* + algorithm specific parameters */ - int unprocessed_bits; uint64_t bits_processed; + int unprocessed_bits; union { - EdonRData256 p256[1]; EdonRData512 p512[1]; } pipe[1]; } EdonRState; -void EdonRInit(EdonRState *state, size_t hashbitlen); +void EdonRInit(EdonRState *state); void EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen); void EdonRFinal(EdonRState *state, uint8_t *hashval); -void EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen, - uint8_t *hashval); +void EdonRHash(const uint8_t *data, size_t databitlen, uint8_t *hashval); #ifdef __cplusplus } diff --git a/include/sys/efi_partition.h b/include/sys/efi_partition.h index 7d5e42e945ad..c4d7fd5088b5 100644 --- a/include/sys/efi_partition.h +++ b/include/sys/efi_partition.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index cd080c8ee667..c746600cd2d5 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -78,6 +78,12 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N "vdev_cksum_n" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" @@ -98,8 +104,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" @@ -108,8 +112,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" #define 
FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME "snapshot_name" #define FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME "device_name" #define FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME "raw_name" diff --git a/include/sys/fm/protocol.h b/include/sys/fm/protocol.h index 78031f7c15ec..d4a9751c8aeb 100644 --- a/include/sys/fm/protocol.h +++ b/include/sys/fm/protocol.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/fm/util.h b/include/sys/fm/util.h index 5fb6d1d6072b..038162ab7524 100644 --- a/include/sys/fm/util.h +++ b/include/sys/fm/util.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -95,7 +95,7 @@ extern void fm_init(void); extern void fm_fini(void); extern void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector); extern int zfs_zevent_post(nvlist_t *, nvlist_t *, zevent_cb_t *); -extern void zfs_zevent_drain_all(int *); +extern void zfs_zevent_drain_all(uint_t *); extern zfs_file_t *zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **); extern void zfs_zevent_fd_rele(zfs_file_t *); extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f013e6b20603..e191420f2d2d 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -29,6 +29,7 @@ * Copyright (c) 2019 Datto Inc. * Portions Copyright 2010 Robert Milkowski * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. 
*/ #ifndef _SYS_FS_ZFS_H @@ -189,6 +190,9 @@ typedef enum { ZFS_PROP_IVSET_GUID, /* not exposed to the user */ ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, + ZFS_PROP_SNAPSHOTS_CHANGED, + ZFS_PROP_PREFETCH, + ZFS_PROP_VOLTHREADING, ZFS_NUM_PROPS } zfs_prop_t; @@ -251,6 +255,9 @@ typedef enum { ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, + ZPOOL_PROP_BCLONEUSED, + ZPOOL_PROP_BCLONESAVED, + ZPOOL_PROP_BCLONERATIO, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -353,6 +360,14 @@ typedef enum { VDEV_PROP_BYTES_TRIM, VDEV_PROP_REMOVING, VDEV_PROP_ALLOCATING, + VDEV_PROP_FAILFAST, + VDEV_PROP_CHECKSUM_N, + VDEV_PROP_CHECKSUM_T, + VDEV_PROP_IO_N, + VDEV_PROP_IO_T, + VDEV_PROP_RAIDZ_EXPANDING, + VDEV_PROP_SLOW_IO_N, + VDEV_PROP_SLOW_IO_T, VDEV_NUM_PROPS } vdev_prop_t; @@ -500,7 +515,9 @@ typedef enum { typedef enum { ZFS_REDUNDANT_METADATA_ALL, - ZFS_REDUNDANT_METADATA_MOST + ZFS_REDUNDANT_METADATA_MOST, + ZFS_REDUNDANT_METADATA_SOME, + ZFS_REDUNDANT_METADATA_NONE } zfs_redundant_metadata_type_t; typedef enum { @@ -531,6 +548,12 @@ typedef enum zfs_key_location { ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; +typedef enum { + ZFS_PREFETCH_NONE = 0, + ZFS_PREFETCH_METADATA = 1, + ZFS_PREFETCH_ALL = 2 +} zfs_prefetch_type_t; + #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 @@ -704,6 +727,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ +#define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ @@ -769,6 +793,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" 
+#define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" @@ -804,6 +830,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ +#define ZPOOL_CONFIG_VDEV_ROOT_ZAP "com.klarasystems:vdev_zap_root" #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" @@ -886,6 +913,15 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ + "org.openzfs:raidz_expand_state" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ + "org.openzfs:raidz_expand_start_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ + "org.openzfs:raidz_expand_end_time" +#define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ + "org.openzfs:raidz_expand_bytes_copied" + /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" @@ -1023,6 +1059,7 @@ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, + POOL_SCAN_ERRORSCRUB, POOL_SCAN_FUNCS } pool_scan_func_t; @@ -1057,12 +1094,18 @@ typedef enum zio_type { ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, - ZIO_TYPE_IOCTL, + ZIO_TYPE_FLUSH, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; /* + * Compatibility: _IOCTL was renamed to _FLUSH; keep the old name available to + * user programs. + */ +#define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH + +/* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. 
*/ @@ -1074,7 +1117,7 @@ typedef struct pool_scan_stat { uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ - uint64_t pss_to_process; /* total bytes to process */ + uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ @@ -1086,6 +1129,20 @@ typedef struct pool_scan_stat { uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ + + /* error scrub values stored on disk */ + uint64_t pss_error_scrub_func; /* pool_scan_func_t */ + uint64_t pss_error_scrub_state; /* dsl_scan_state_t */ + uint64_t pss_error_scrub_start; /* error scrub start time */ + uint64_t pss_error_scrub_end; /* error scrub end time */ + uint64_t pss_error_scrub_examined; /* error blocks issued I/O */ + /* error blocks to be issued I/O */ + uint64_t pss_error_scrub_to_be_examined; + + /* error scrub values not stored on disk */ + /* error scrub pause time in milliseconds */ + uint64_t pss_pass_error_scrub_pause; + } pool_scan_stat_t; typedef struct pool_removal_stat { @@ -1102,11 +1159,22 @@ typedef struct pool_removal_stat { uint64_t prs_mapping_memory; } pool_removal_stat_t; +typedef struct pool_raidz_expand_stat { + uint64_t pres_state; /* dsl_scan_state_t */ + uint64_t pres_expanding_vdev; + uint64_t pres_start_time; + uint64_t pres_end_time; + uint64_t pres_to_reflow; /* bytes that need to be moved */ + uint64_t pres_reflowed; /* bytes moved so far */ + uint64_t pres_waiting_for_resilver; +} pool_raidz_expand_stat_t; + typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, + DSS_ERRORSCRUBBING, DSS_NUM_STATES } dsl_scan_state_t; @@ -1123,6 +1191,7 @@ typedef struct vdev_rebuild_stat { uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; 
/* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ + uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* @@ -1252,6 +1321,7 @@ typedef enum pool_initialize_func { POOL_INITIALIZE_START, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, + POOL_INITIALIZE_UNINIT, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; @@ -1346,7 +1416,7 @@ typedef enum { */ typedef enum zfs_ioc { /* - * Core features - 81/128 numbers reserved. + * Core features - 88/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, @@ -1441,6 +1511,7 @@ typedef enum zfs_ioc { ZFS_IOC_WAIT_FS, /* 0x5a54 */ ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ + ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ /* * Per-platform (Optional) - 8/128 numbers reserved. @@ -1535,6 +1606,10 @@ typedef enum { ZFS_ERR_BADPROP, ZFS_ERR_VDEV_NOTSUP, ZFS_ERR_NOT_USER_NAMESPACE, + ZFS_ERR_RESUME_EXISTS, + ZFS_ERR_CRYPTO_NOTSUP, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, + ZFS_ERR_ASHIFT_MISMATCH, } zfs_errno_t; /* @@ -1559,6 +1634,7 @@ typedef enum { ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, + ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; @@ -1651,6 +1727,7 @@ typedef enum { #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 +#define ZFS_ONLINE_SPARE 0x10 #define ZFS_OFFLINE_TEMPORARY 0x1 /* @@ -1757,9 +1834,9 @@ typedef enum { * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * The recordsize property can not be set larger than zfs_max_recordsize + * (default 16MB on 64-bit and 1MB on 32-bit). 
See the comment near + * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 140739adb562..815b5d0c9cf1 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -39,6 +39,7 @@ extern "C" { typedef struct metaslab_ops { + const char *msop_name; uint64_t (*msop_alloc)(metaslab_t *, uint64_t); } metaslab_ops_t; @@ -80,7 +81,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_ASYNC_ALLOC 0x8 #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 -#define METASLAB_FASTWRITE 0x40 #define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, @@ -96,8 +96,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t); int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t); void metaslab_check_free(spa_t *, const blkptr_t *); -void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); -void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 820c61a252e2..4f434291ddbf 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
+ * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -250,7 +250,6 @@ struct metaslab_group { int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; - taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; @@ -313,7 +312,7 @@ struct metaslab_group { * Each metaslab maintains a set of in-core trees to track metaslab * operations. The in-core free tree (ms_allocatable) contains the list of * free segments which are eligible for allocation. As blocks are - * allocated, the allocated segment are removed from the ms_allocatable and + * allocated, the allocated segments are removed from the ms_allocatable and * added to a per txg allocation tree (ms_allocating). As blocks are * freed, they are added to the free tree (ms_freeing). These trees * allow us to process all allocations and frees in syncing context @@ -366,9 +365,9 @@ struct metaslab_group { struct metaslab { /* * This is the main lock of the metaslab and its purpose is to - * coordinate our allocations and frees [e.g metaslab_block_alloc(), + * coordinate our allocations and frees [e.g., metaslab_block_alloc(), * metaslab_free_concrete(), ..etc] with our various syncing - * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc]. + * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc]. 
* * The lock is also used during some miscellaneous operations like * using the metaslab's histogram for the metaslab group's histogram diff --git a/include/sys/mmp.h b/include/sys/mmp.h index ce9c4496a04f..1023334098d8 100644 --- a/include/sys/mmp.h +++ b/include/sys/mmp.h @@ -64,7 +64,7 @@ extern void mmp_signal_all_threads(void); /* Global tuning */ extern int param_set_multihost_interval(ZFS_MODULE_PARAM_ARGS); -extern ulong_t zfs_multihost_interval; +extern uint64_t zfs_multihost_interval; extern uint_t zfs_multihost_fail_intervals; extern uint_t zfs_multihost_import_intervals; diff --git a/include/sys/mntent.h b/include/sys/mntent.h index 8d578f67b8a7..5bb7e080cda8 100644 --- a/include/sys/mntent.h +++ b/include/sys/mntent.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -108,5 +108,8 @@ #define MNTOPT_NOACL "noacl" /* likewise */ #define MNTOPT_POSIXACL "posixacl" /* likewise */ #define MNTOPT_MNTPOINT "mntpoint" /* mount point hint */ +#define MNTOPT_CASESENSITIVE "casesensitive" /* case sensitivity */ +#define MNTOPT_CASEINSENSITIVE "caseinsensitive" /* case insensitivity */ +#define MNTOPT_CASEMIXED "casemixed" /* case mixed */ #endif /* _SYS_MNTENT_H */ diff --git a/include/sys/multilist.h b/include/sys/multilist.h index 26f37c37ab38..e7de86f2379b 100644 --- a/include/sys/multilist.h +++ b/include/sys/multilist.h @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *); unsigned int multilist_get_num_sublists(multilist_t *); unsigned int multilist_get_random_index(multilist_t *); -multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_lock(multilist_sublist_t *); +multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int); multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); void multilist_sublist_unlock(multilist_sublist_t *); void multilist_sublist_insert_head(multilist_sublist_t *, void *); void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *); +void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *); void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); void multilist_sublist_remove(multilist_sublist_t *, void *); int multilist_sublist_is_empty(multilist_sublist_t *); diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h index 81494b62d7ec..2dbd9e3eaf46 100644 --- a/include/sys/nvpair.h +++ b/include/sys/nvpair.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -76,7 +76,7 @@ typedef struct nvpair { int16_t nvp_reserve; /* not used */ int32_t nvp_value_elem; /* number of elements for array types */ data_type_t nvp_type; /* type of value */ - /* name string */ + char nvp_name[]; /* name string */ /* aligned ptr array for string arrays */ /* aligned array of data for value */ } nvpair_t; @@ -109,7 +109,7 @@ typedef struct nvlist { #define NV_ALIGN4(x) (((x) + 3) & ~3) #define NVP_SIZE(nvp) ((nvp)->nvp_size) -#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t)) +#define NVP_NAME(nvp) ((nvp)->nvp_name) #define NVP_TYPE(nvp) ((nvp)->nvp_type) #define NVP_NELEM(nvp) ((nvp)->nvp_value_elem) #define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \ @@ -232,7 +232,8 @@ _SYS_NVPAIR_H int nvlist_lookup_int64(const nvlist_t *, const char *, int64_t *); _SYS_NVPAIR_H int nvlist_lookup_uint64(const nvlist_t *, const char *, uint64_t *); -_SYS_NVPAIR_H int nvlist_lookup_string(nvlist_t *, const char *, char **); +_SYS_NVPAIR_H int nvlist_lookup_string(const nvlist_t *, const char *, + const char **); _SYS_NVPAIR_H int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); _SYS_NVPAIR_H int nvlist_lookup_boolean_array(nvlist_t *, const char *, boolean_t **, uint_t *); @@ -267,14 +268,14 @@ _SYS_NVPAIR_H int nvlist_lookup_double(const nvlist_t *, const char *, _SYS_NVPAIR_H int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); _SYS_NVPAIR_H int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, - nvpair_t **, int *, char **); + nvpair_t **, int *, const char **); _SYS_NVPAIR_H boolean_t nvlist_exists(const nvlist_t *, const char *); _SYS_NVPAIR_H boolean_t nvlist_empty(const nvlist_t *); /* processing nvpair */ _SYS_NVPAIR_H nvpair_t *nvlist_next_nvpair(nvlist_t *, const nvpair_t *); _SYS_NVPAIR_H nvpair_t *nvlist_prev_nvpair(nvlist_t *, const nvpair_t *); -_SYS_NVPAIR_H char 
*nvpair_name(const nvpair_t *); +_SYS_NVPAIR_H const char *nvpair_name(const nvpair_t *); _SYS_NVPAIR_H data_type_t nvpair_type(const nvpair_t *); _SYS_NVPAIR_H int nvpair_type_is_array(const nvpair_t *); _SYS_NVPAIR_H int nvpair_value_boolean_value(const nvpair_t *, boolean_t *); @@ -287,7 +288,7 @@ _SYS_NVPAIR_H int nvpair_value_int32(const nvpair_t *, int32_t *); _SYS_NVPAIR_H int nvpair_value_uint32(const nvpair_t *, uint32_t *); _SYS_NVPAIR_H int nvpair_value_int64(const nvpair_t *, int64_t *); _SYS_NVPAIR_H int nvpair_value_uint64(const nvpair_t *, uint64_t *); -_SYS_NVPAIR_H int nvpair_value_string(nvpair_t *, char **); +_SYS_NVPAIR_H int nvpair_value_string(const nvpair_t *, const char **); _SYS_NVPAIR_H int nvpair_value_nvlist(nvpair_t *, nvlist_t **); _SYS_NVPAIR_H int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); @@ -300,7 +301,8 @@ _SYS_NVPAIR_H int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); _SYS_NVPAIR_H int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); _SYS_NVPAIR_H int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); _SYS_NVPAIR_H int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); -_SYS_NVPAIR_H int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); +_SYS_NVPAIR_H int nvpair_value_string_array(nvpair_t *, const char ***, + uint_t *); _SYS_NVPAIR_H int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); _SYS_NVPAIR_H int nvpair_value_hrtime(nvpair_t *, hrtime_t *); #if !defined(_KERNEL) && !defined(_STANDALONE) @@ -373,7 +375,8 @@ _SYS_NVPAIR_H uint8_t fnvlist_lookup_uint8(const nvlist_t *, const char *); _SYS_NVPAIR_H uint16_t fnvlist_lookup_uint16(const nvlist_t *, const char *); _SYS_NVPAIR_H uint32_t fnvlist_lookup_uint32(const nvlist_t *, const char *); _SYS_NVPAIR_H uint64_t fnvlist_lookup_uint64(const nvlist_t *, const char *); -_SYS_NVPAIR_H char *fnvlist_lookup_string(nvlist_t *, const char *); +_SYS_NVPAIR_H const char *fnvlist_lookup_string(const 
nvlist_t *, + const char *); _SYS_NVPAIR_H nvlist_t *fnvlist_lookup_nvlist(nvlist_t *, const char *); _SYS_NVPAIR_H boolean_t *fnvlist_lookup_boolean_array(nvlist_t *, const char *, uint_t *); @@ -406,7 +409,7 @@ _SYS_NVPAIR_H uint8_t fnvpair_value_uint8(const nvpair_t *nvp); _SYS_NVPAIR_H uint16_t fnvpair_value_uint16(const nvpair_t *nvp); _SYS_NVPAIR_H uint32_t fnvpair_value_uint32(const nvpair_t *nvp); _SYS_NVPAIR_H uint64_t fnvpair_value_uint64(const nvpair_t *nvp); -_SYS_NVPAIR_H char *fnvpair_value_string(nvpair_t *nvp); +_SYS_NVPAIR_H const char *fnvpair_value_string(const nvpair_t *nvp); _SYS_NVPAIR_H nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); #ifdef __cplusplus diff --git a/include/sys/nvpair_impl.h b/include/sys/nvpair_impl.h index 809e5c454712..6cae256285e5 100644 --- a/include/sys/nvpair_impl.h +++ b/include/sys/nvpair_impl.h @@ -7,7 +7,7 @@ * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/pathname.h b/include/sys/pathname.h index 52f21316c23d..054223170db1 100644 --- a/include/sys/pathname.h +++ b/include/sys/pathname.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/qat.h b/include/sys/qat.h index fe0f2c672f97..76360ba99042 100644 --- a/include/sys/qat.h +++ b/include/sys/qat.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index daa39e20dbd6..d6f60e795288 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/rrwlock.h b/include/sys/rrwlock.h index 51ac364af519..367732a8391c 100644 --- a/include/sys/rrwlock.h +++ b/include/sys/rrwlock.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/sa.h b/include/sys/sa.h index 42479652ab2c..c551acecab30 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index fa10aff8a306..744c8dcb7dfb 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/sha2.h b/include/sys/sha2.h new file mode 100644 index 000000000000..81dfbbb8cea9 --- /dev/null +++ b/include/sys/sha2.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> + */ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#ifdef _KERNEL +#include <sys/types.h> +#else +#include <stdint.h> +#include <stdlib.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA224_BLOCK_LENGTH 64 +#define SHA256_BLOCK_LENGTH 64 +#define SHA384_BLOCK_LENGTH 128 +#define SHA512_BLOCK_LENGTH 128 + +#define SHA224_DIGEST_LENGTH 28 +#define SHA256_DIGEST_LENGTH 32 +#define SHA384_DIGEST_LENGTH 48 +#define SHA512_DIGEST_LENGTH 64 + +#define SHA512_224_DIGEST_LENGTH 28 +#define SHA512_256_DIGEST_LENGTH 32 + +#define SHA256_HMAC_BLOCK_SIZE 64 +#define SHA512_HMAC_BLOCK_SIZE 128 + +/* sha256 context */ +typedef struct { + uint32_t state[8]; + uint64_t count[2]; + uint8_t wbuf[64]; + + /* const sha256_ops_t *ops */ + const void *ops; +} sha256_ctx; + +/* sha512 context */ +typedef struct { + uint64_t state[8]; + uint64_t count[2]; + uint8_t wbuf[128]; + + /* const sha256_ops_t *ops */ + const void *ops; +} sha512_ctx; + +/* SHA2 context */ +typedef struct { + union { + sha256_ctx sha256; + sha512_ctx sha512; + }; + + /* algorithm type */ + int algotype; +} SHA2_CTX; + +/* SHA2 algorithm types */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + 
SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* SHA2 Init function */ +extern void SHA2Init(int algotype, SHA2_CTX *ctx); + +/* SHA2 Update function */ +extern void SHA2Update(SHA2_CTX *ctx, const void *data, size_t len); + +/* SHA2 Final function */ +extern void SHA2Final(void *digest, SHA2_CTX *ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index cd2499b30b40..3073c4d1b937 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,16 +20,16 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2021 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Datto Inc. 
*/ #ifndef _SYS_SPA_H @@ -62,9 +62,8 @@ typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; -typedef struct ddt ddt_t; -typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; +typedef struct zbookmark_err_phys zbookmark_err_phys_t; struct bpobj; struct bplist; @@ -126,15 +125,15 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | pad | vdev1 | GRID | ASIZE | + * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | pad | vdev2 | GRID | ASIZE | + * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | pad | vdev3 | GRID | ASIZE | + * 4 | pad | vdev3 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -166,7 +165,6 @@ typedef struct zio_cksum_salt { * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) - * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator @@ -191,11 +189,11 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | + * 0 | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | + * 2 | vdev2 | pad | ASIZE | * 
+-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -356,7 +354,7 @@ typedef enum bp_embedded_type { #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ - ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + ((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1])) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ @@ -375,8 +373,7 @@ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ - uint64_t blk_phys_birth; /* txg when block was allocated */ - uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_birth_word[2]; uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; @@ -396,9 +393,6 @@ typedef struct blkptr { BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) -#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) -#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) - #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) @@ -481,15 +475,23 @@ typedef struct blkptr { #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) -#define BP_PHYSICAL_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (bp)->blk_phys_birth ? 
(bp)->blk_phys_birth : (bp)->blk_birth) +#define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1] +#define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x)) + +#define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] +#define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x)) + +#define BP_GET_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \ + BP_GET_LOGICAL_BIRTH(bp)) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ - (bp)->blk_birth = (logical); \ - (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ + BP_SET_LOGICAL_BIRTH(bp, logical); \ + BP_SET_PHYSICAL_BIRTH(bp, \ + ((logical) == (physical) ? 0 : (physical))); \ } #define BP_GET_FILL(bp) \ @@ -542,8 +544,8 @@ typedef struct blkptr { (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ - (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ - (bp1)->blk_birth == (bp2)->blk_birth && \ + (BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \ + BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) @@ -582,8 +584,8 @@ typedef struct blkptr { (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ - (bp)->blk_phys_birth = 0; \ - (bp)->blk_birth = 0; \ + (bp)->blk_birth_word[0] = 0; \ + (bp)->blk_birth_word[1] = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } @@ -600,7 +602,7 @@ typedef struct blkptr { /* * This macro allows code sharing between zfs, libzpool, and mdb. - * 'func' is either snprintf() or mdb_snprintf(). + * 'func' is either kmem_scnprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
*/ @@ -632,7 +634,7 @@ typedef struct blkptr { (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ @@ -643,14 +645,14 @@ typedef struct blkptr { compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ @@ -663,6 +665,7 @@ typedef struct blkptr { (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ + ASSERT3S(copies, >, 0); \ if (BP_IS_ENCRYPTED(bp)) { \ len += func(buf + len, size - len, \ "salt=%llx iv=%llx:%llx%c", \ @@ -678,7 +681,7 @@ typedef struct blkptr { len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ - "cksum=%llx:%llx:%llx:%llx", \ + "cksum=%016llx:%016llx:%016llx:%016llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ @@ -691,8 +694,8 @@ typedef struct blkptr { ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth, \ - (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ @@ -721,16 +724,10 @@ typedef enum spa_mode { * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes - * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. 
*/ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, -#ifdef IN_FREEBSD_BASE - SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, -#else - SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, -#endif } spa_autotrim_t; /* @@ -773,7 +770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 @@ -785,9 +782,10 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 +#define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, @@ -826,10 +824,19 @@ extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); -extern int zfs_sync_pass_deferred_free; +extern uint_t zfs_sync_pass_deferred_free; + +/* spa sync taskqueues */ +taskq_t *spa_sync_tq_create(spa_t *spa, const char *name); +void spa_sync_tq_destroy(spa_t *spa); +uint_t spa_acq_allocator(spa_t *spa); +void spa_rel_allocator(spa_t *spa, uint_t allocator); +void spa_select_allocator(zio_t *zio); /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; +extern avl_tree_t spa_namespace_avl; +extern kcondvar_t spa_namespace_cv; /* * SPA configuration functions in spa_config.c @@ -838,9 +845,9 @@ extern kmutex_t spa_namespace_lock; #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 
1 -extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); +extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); +extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); @@ -969,11 +976,17 @@ extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); +extern void spa_import_progress_set_notes(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); +extern void spa_import_progress_set_notes_nolog(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); +extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, + krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); @@ -1013,7 +1026,7 @@ extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); -extern int spa_sync_pass(spa_t *spa); +extern uint32_t spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); @@ -1057,6 +1070,8 @@ extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); +extern int spa_get_allocator(spa_t *spa); +extern void spa_set_allocator(spa_t *spa, 
const char *allocator); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...) @@ -1110,6 +1125,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); +extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); + extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); @@ -1133,7 +1150,10 @@ extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; -extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); +extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, + const uint64_t birth); +extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, + uint64_t birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, @@ -1145,8 +1165,9 @@ extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); -extern uint64_t spa_get_errlog_size(spa_t *spa); +extern uint64_t spa_approx_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); +extern uint64_t spa_get_last_errlog_size(spa_t *spa); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); @@ -1157,10 +1178,13 @@ extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx); extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx); - -/* vdev cache */ -extern void vdev_cache_stat_init(void); -extern void 
vdev_cache_stat_fini(void); +extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds, + zbookmark_err_phys_t *zep, uint64_t *top_affected_fs); +extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg); +extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, + zbookmark_phys_t *zb); +extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); /* vdev mirror */ extern void vdev_mirror_stat_init(void); @@ -1201,6 +1225,7 @@ int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); +int param_set_active_allocator(ZFS_MODULE_PARAM_ARGS); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ @@ -1217,9 +1242,9 @@ int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); extern spa_mode_t spa_mode_global; extern int zfs_deadman_enabled; -extern unsigned long zfs_deadman_synctime_ms; -extern unsigned long zfs_deadman_ziotime_ms; -extern unsigned long zfs_deadman_checktime_ms; +extern uint64_t zfs_deadman_synctime_ms; +extern uint64_t zfs_deadman_ziotime_ms; +extern uint64_t zfs_deadman_checktime_ms; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; diff --git a/include/sys/spa_checkpoint.h b/include/sys/spa_checkpoint.h index 9be2b6eeab3c..e4475ff35f44 100644 --- a/include/sys/spa_checkpoint.h +++ b/include/sys/spa_checkpoint.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/spa_checksum.h b/include/sys/spa_checksum.h index b87990105a71..2202afdeb8da 100644 --- a/include/sys/spa_checksum.h +++ b/include/sys/spa_checksum.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 9946c4e3c316..a40914ec5fcb 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,13 +20,13 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. - * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019 Datto Inc. 
*/ #ifndef _SYS_SPA_IMPL_H @@ -38,6 +38,7 @@ #include <sys/vdev.h> #include <sys/vdev_rebuild.h> #include <sys/vdev_removal.h> +#include <sys/vdev_raidz.h> #include <sys/metaslab.h> #include <sys/dmu.h> #include <sys/dsl_pool.h> @@ -62,10 +63,17 @@ typedef struct spa_alloc { avl_tree_t spaa_tree; } ____cacheline_aligned spa_alloc_t; +typedef struct spa_allocs_use { + kmutex_t sau_lock; + uint_t sau_rotor; + boolean_t sau_inuse[]; +} spa_allocs_use_t; + typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; avl_node_t se_avl; + zbookmark_err_phys_t se_zep; /* not accounted in avl_find */ } spa_error_entry_t; typedef struct spa_history_phys { @@ -187,6 +195,12 @@ typedef struct spa_taskqs { taskq_t **stqs_taskq; } spa_taskqs_t; +/* one for each thread in the spa sync taskq */ +typedef struct spa_syncthread_info { + kthread_t *sti_thread; + uint_t sti_allocator; +} spa_syncthread_info_t; + typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. 
*/ @@ -215,7 +229,7 @@ struct spa { nvlist_t *spa_config_splitting; /* config for splitting */ nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ - int spa_sync_pass; /* iterate-to-convergence */ + uint32_t spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ @@ -229,6 +243,7 @@ struct spa { dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ + kthread_t *spa_load_thread; /* loading, no namespace lock */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ @@ -249,6 +264,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ @@ -260,10 +276,17 @@ struct spa { * allocation performance in write-heavy workloads. 
*/ spa_alloc_t *spa_allocs; + spa_allocs_use_t *spa_allocs_use; int spa_alloc_count; + int spa_active_allocator; /* selectable allocator */ + + /* per-allocator sync thread taskqs */ + taskq_t *spa_sync_tq; + spa_syncthread_info_t *spa_syncthreads; spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + boolean_t spa_aux_sync_uber; /* need to sync aux uber */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ @@ -294,6 +317,10 @@ struct spa { uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ + /* error scrub pause time in milliseconds */ + uint64_t spa_scan_pass_errorscrub_pause; + /* total error scrub paused time in milliseconds */ + uint64_t spa_scan_pass_errorscrub_spent_paused; /* * We are in the middle of a resilver, and another resilver * is needed once this one completes. This is set iff any @@ -316,6 +343,9 @@ struct spa { spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. */ + vdev_raidz_expand_t *spa_raidz_expand; + zthr_t *spa_raidz_expand_zthr; + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; @@ -349,6 +379,7 @@ struct spa { kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ + avl_tree_t spa_errlist_healed; /* list of healed blocks */ uint64_t spa_deflate; /* should we deflate? 
*/ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ @@ -379,6 +410,7 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ @@ -415,7 +447,9 @@ struct spa { hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ + taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */ taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ + taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ @@ -439,15 +473,13 @@ struct spa { */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ - - taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ }; extern char *spa_config_path; extern const char *zfs_deadman_failmode; -extern int spa_slop_shift; +extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, - task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, zio_t *zio); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); @@ -459,6 +491,8 @@ extern int param_set_deadman_failmode_common(const char *val); extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); +extern const char *zfs_active_allocator; +extern int 
param_set_active_allocator_common(const char *val); #ifdef __cplusplus } diff --git a/include/sys/spa_log_spacemap.h b/include/sys/spa_log_spacemap.h index 72229df6cd16..f59e69917833 100644 --- a/include/sys/spa_log_spacemap.h +++ b/include/sys/spa_log_spacemap.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/space_map.h b/include/sys/space_map.h index cb81e710bd1e..14c5beccee55 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h index ca9d41dc1388..b7a846aec624 100644 --- a/include/sys/space_reftree.h +++ b/include/sys/space_reftree.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/sysevent.h b/include/sys/sysevent.h index 6510297d601f..f8ae17497366 100644 --- a/include/sys/sysevent.h +++ b/include/sys/sysevent.h @@ -7,7 +7,7 @@ * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
+ * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h index 1117538d822d..0783d0073162 100644 --- a/include/sys/sysevent/dev.h +++ b/include/sys/sysevent/dev.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -244,6 +244,9 @@ extern "C" { #define DEV_PATH "path" #define DEV_IS_PART "is_slice" #define DEV_SIZE "dev_size" + +/* Size of the whole parent block device (if dev is a partition) */ +#define DEV_PARENT_SIZE "dev_parent_size" #endif /* __linux__ */ #define EV_V1 1 diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h index 2067b355afb4..a21085257967 100644 --- a/include/sys/sysevent/eventdefs.h +++ b/include/sys/sysevent/eventdefs.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -123,6 +123,11 @@ extern "C" { #define ESC_ZFS_TRIM_CANCEL "trim_cancel" #define ESC_ZFS_TRIM_RESUME "trim_resume" #define ESC_ZFS_TRIM_SUSPEND "trim_suspend" +#define ESC_ZFS_ERRORSCRUB_START "errorscrub_start" +#define ESC_ZFS_ERRORSCRUB_FINISH "errorscrub_finish" +#define ESC_ZFS_ERRORSCRUB_ABORT "errorscrub_abort" +#define ESC_ZFS_ERRORSCRUB_RESUME "errorscrub_resume" +#define ESC_ZFS_ERRORSCRUB_PAUSED "errorscrub_paused" /* * datalink subclass definitions. 
diff --git a/include/sys/txg.h b/include/sys/txg.h index f38f0006c040..46945210cdb5 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -138,7 +138,7 @@ extern void *txg_list_head(txg_list_t *tl, uint64_t txg); extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); /* Global tuning */ -extern int zfs_txg_timeout; +extern uint_t zfs_txg_timeout; #ifdef ZFS_DEBUG diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h index 047d51b94c66..8ab7969b25be 100644 --- a/include/sys/txg_impl.h +++ b/include/sys/txg_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -73,8 +73,7 @@ struct tx_cpu { kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ - char tc_pad[8]; /* pad to fill 3 cache lines */ -}; +} ____cacheline_aligned; /* * The tx_state structure maintains the state information about the different diff --git a/include/sys/u8_textprep.h b/include/sys/u8_textprep.h index 09ab13af268c..e82037de4fe4 100644 --- a/include/sys/u8_textprep.h +++ b/include/sys/u8_textprep.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
+ * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/u8_textprep_data.h b/include/sys/u8_textprep_data.h index 03f71f26c9e1..2a97966ee56e 100644 --- a/include/sys/u8_textprep_data.h +++ b/include/sys/u8_textprep_data.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/uberblock.h b/include/sys/uberblock.h index 044e438387c0..ff3a8c81232a 100644 --- a/include/sys/uberblock.h +++ b/include/sys/uberblock.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 91699e65131a..e480a4bac0b9 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -50,20 +50,20 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ + (ubp)->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ >> 48) #define MMP_INTERVAL_SET(write) \ @@ -75,6 +75,39 @@ extern "C" { #define MMP_FAIL_INT_SET(fail) \ (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) +/* + * RAIDZ expansion reflow information. 
+ * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |Scratch | Reflow | + * | State | Offset | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ +typedef enum raidz_reflow_scratch_state { + RRSS_SCRATCH_NOT_IN_USE = 0, + RRSS_SCRATCH_VALID, + RRSS_SCRATCH_INVALID_SYNCED, + RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, + RRSS_SCRATCH_INVALID_SYNCED_REFLOW +} raidz_reflow_scratch_state_t; + +#define RRSS_GET_OFFSET(ub) \ + BF64_GET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0) +#define RRSS_SET_OFFSET(ub, x) \ + BF64_SET_SB((ub)->ub_raidz_reflow_info, 0, 55, SPA_MINBLOCKSHIFT, 0, x) + +#define RRSS_GET_STATE(ub) \ + BF64_GET((ub)->ub_raidz_reflow_info, 55, 9) +#define RRSS_SET_STATE(ub, x) \ + BF64_SET((ub)->ub_raidz_reflow_info, 55, 9, x) + +#define RAIDZ_REFLOW_SET(ub, state, offset) do { \ + (ub)->ub_raidz_reflow_info = 0; \ + RRSS_SET_OFFSET(ub, offset); \ + RRSS_SET_STATE(ub, state); \ +} while (0) + struct uberblock { uint64_t ub_magic; /* UBERBLOCK_MAGIC */ uint64_t ub_version; /* SPA_VERSION */ @@ -132,10 +165,12 @@ struct uberblock { * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], * the value of the field is used to determine which ZIL blocks have * been allocated according to the ms_sm when we are rewinding to a - * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then + * checkpoint. Specifically, if logical birth > ub_checkpoint_txg, then * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. */ uint64_t ub_checkpoint_txg; + + uint64_t ub_raidz_reflow_info; }; #ifdef __cplusplus diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index cde3ef40485b..aa34edda5f6a 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License.
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/unique.h b/include/sys/unique.h index d4ba32e5c642..bc7944657521 100644 --- a/include/sys/unique.h +++ b/include/sys/unique.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/uuid.h b/include/sys/uuid.h index eab4622a6d9a..19f1baa4432e 100644 --- a/include/sys/uuid.h +++ b/include/sys/uuid.h @@ -7,7 +7,7 @@ * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 8a526d0bf511..38f62b07dc59 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -132,15 +132,19 @@ extern void vdev_space_update(vdev_t *vd, extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, + uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); /* - * Return the amount of space allocated for a gang block header. + * Return the amount of space allocated for a gang block header. Note that + * since the physical birth txg is not provided, this must be constant for + * a given vdev. (e.g. raidz expansion can't change this) */ static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); + return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); @@ -148,6 +152,7 @@ extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); +extern int vdev_remove_wanted(spa_t *spa, uint64_t guid); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); @@ -157,20 +162,15 @@ extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); -extern void vdev_cache_init(vdev_t *vd); -extern void vdev_cache_fini(vdev_t *vd); -extern boolean_t vdev_cache_read(zio_t *zio); -extern void vdev_cache_write(zio_t *zio); -extern void vdev_cache_purge(vdev_t *vd); - extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); -extern int vdev_queue_length(vdev_t *vd); +extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); +extern 
uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); @@ -185,11 +185,12 @@ extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx); typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, VDEV_CONFIG_L2CACHE = 1 << 1, - VDEV_CONFIG_REMOVING = 1 << 2, - VDEV_CONFIG_MOS = 1 << 3, - VDEV_CONFIG_MISSING = 1 << 4 + VDEV_CONFIG_MOS = 1 << 2, + VDEV_CONFIG_MISSING = 1 << 3 } vdev_config_flag_t; +extern void vdev_post_kobj_evt(vdev_t *vd); +extern void vdev_clear_kobj_evt(vdev_t *vd); extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags); @@ -207,6 +208,8 @@ extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, int flags); extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *); +extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int); +extern int vdev_check_boot_reserve(spa_t *, vdev_t *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h index a7e19fbf0c4b..02c583777ebc 100644 --- a/include/sys/vdev_disk.h +++ b/include/sys/vdev_disk.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h index dd334acbacf1..a204f1e3c74a 100644 --- a/include/sys/vdev_draid.h +++ b/include/sys/vdev_draid.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/vdev_file.h b/include/sys/vdev_file.h index 1514a44fcabb..fddecbfe1ab5 100644 --- a/include/sys/vdev_file.h +++ b/include/sys/vdev_file.h @@ -7,7 +7,7 @@ * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index db8fbdeb06df..57ff31e89eb9 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023, Klara Inc. 
*/ #ifndef _SYS_VDEV_IMPL_H @@ -34,7 +35,6 @@ #include <sys/nvpair.h> #include <sys/space_map.h> #include <sys/vdev.h> -#include <sys/dkio.h> #include <sys/uberblock_impl.h> #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> @@ -57,23 +57,22 @@ extern "C" { * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; -typedef struct vdev_cache vdev_cache_t; -typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; -extern int zfs_vdev_queue_depth_pct; -extern int zfs_vdev_def_queue_depth; -extern uint32_t zfs_vdev_async_write_max_active; +extern uint_t zfs_vdev_queue_depth_pct; +extern uint_t zfs_vdev_def_queue_depth; +extern uint_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); +typedef void vdev_kobj_post_evt_func_t(vdev_t *vd); typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); -typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); @@ -123,6 +122,7 @@ typedef const struct vdev_ops { vdev_config_generate_func_t *vdev_op_config_generate; vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; + vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -130,44 +130,27 @@ typedef const struct vdev_ops { /* * Virtual device properties */ -struct vdev_cache_entry { - struct abd *ve_abd; - uint64_t ve_offset; - clock_t ve_lastused; - avl_node_t ve_offset_node; - avl_node_t ve_lastused_node; - uint32_t ve_hits; - uint16_t ve_missed_update; - zio_t *ve_fill_io; -}; - 
-struct vdev_cache { - avl_tree_t vc_offset_tree; - avl_tree_t vc_lastused_tree; - kmutex_t vc_lock; -}; - -typedef struct vdev_queue_class { - uint32_t vqc_active; - - /* - * Sorted by offset or timestamp, depending on if the queue is - * LBA-ordered vs FIFO. - */ - avl_tree_t vqc_queued_tree; +typedef union vdev_queue_class { + struct { + ulong_t vqc_list_numnodes; + list_t vqc_list; + }; + avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; - avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_cqueued; /* Classes with queued I/Os. */ + uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; + uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ + list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ @@ -275,6 +258,7 @@ struct vdev { kthread_t *vdev_open_thread; /* thread opening children */ kthread_t *vdev_validate_thread; /* thread validating children */ uint64_t vdev_crtxg; /* txg when top-level was added */ + uint64_t vdev_root_zap; /* * Top-level vdev state. @@ -285,18 +269,19 @@ struct vdev { metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ - uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? 
*/ - boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_fault_wanted; /* async fault wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ + uint64_t vdev_failfast; /* device failfast setting */ + boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ @@ -326,6 +311,7 @@ struct vdev { list_node_t vdev_trim_node; kmutex_t vdev_autotrim_lock; kcondvar_t vdev_autotrim_cv; + kcondvar_t vdev_autotrim_kick_cv; kthread_t *vdev_autotrim_thread; /* Protects vdev_trim_thread and vdev_trim_state. */ kmutex_t vdev_trim_lock; @@ -436,8 +422,9 @@ struct vdev { boolean_t vdev_isl2cache; /* was a l2cache device */ boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ + boolean_t vdev_kobj_flag; /* kobj event record */ + boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ - vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ @@ -465,6 +452,16 @@ struct vdev { zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_checksum_rl; + + /* + * Vdev properties for tuning ZED or zfsd + */ + uint64_t vdev_checksum_n; + uint64_t vdev_checksum_t; + uint64_t vdev_io_n; + uint64_t vdev_io_t; + uint64_t vdev_slow_io_n; + uint64_t vdev_slow_io_t; }; #define VDEV_PAD_SIZE (8 << 10) @@ -542,6 +539,7 @@ typedef struct
vdev_label { /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. + * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ @@ -614,7 +612,7 @@ extern vdev_ops_t vdev_indirect_ops; */ extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs); -extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); @@ -641,12 +639,13 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); +uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables */ -extern uint64_t zfs_vdev_min_auto_ashift; -extern uint64_t zfs_vdev_max_auto_ashift; +extern uint_t zfs_vdev_min_auto_ashift; +extern uint_t zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h index 81d39ebebcb2..78702b7325a0 100644 --- a/include/sys/vdev_initialize.h +++ b/include/sys/vdev_initialize.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -33,6 +33,7 @@ extern "C" { #endif extern void vdev_initialize(vdev_t *vd); +extern void vdev_uninitialize(vdev_t *vd); extern void vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, list_t *vd_list); extern void vdev_initialize_stop_all(vdev_t *vd, diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index c7cf0af6d945..a34bc00ca4df 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -26,6 +26,7 @@ #define _SYS_VDEV_RAIDZ_H #include <sys/types.h> +#include <sys/zfs_rlock.h> #ifdef __cplusplus extern "C" { @@ -35,6 +36,8 @@ struct zio; struct raidz_col; struct raidz_row; struct raidz_map; +struct vdev_raidz; +struct uberblock; #if !defined(_KERNEL) struct kernel_param {}; #endif @@ -44,13 +47,19 @@ struct kernel_param {}; */ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); +struct raidz_map *vdev_raidz_map_alloc_expanded(struct zio *, + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, boolean_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_free(struct vdev_raidz *); void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_child_done(zio_t *); void vdev_raidz_io_done(zio_t *); void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); +struct raidz_row *vdev_raidz_row_alloc(int); +void vdev_raidz_reflow_copy_scratch(spa_t *); +void raidz_dtl_reassessed(vdev_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; @@ -65,11 +74,101 @@ int 
vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz_expand { + uint64_t vre_vdev_id; + + kmutex_t vre_lock; + kcondvar_t vre_cv; + + /* + * How much i/o is outstanding (issued and not completed). + */ + uint64_t vre_outstanding_bytes; + + /* + * Next offset to issue i/o for. + */ + uint64_t vre_offset; + + /* + * Lowest offset of a failed expansion i/o. The expansion will retry + * from here. Once the expansion thread notices the failure and exits, + * vre_failed_offset is reset back to UINT64_MAX, and + * vre_waiting_for_resilver will be set. + */ + uint64_t vre_failed_offset; + boolean_t vre_waiting_for_resilver; + + /* + * Offset that is completing each txg + */ + uint64_t vre_offset_pertxg[TXG_SIZE]; + + /* + * Bytes copied in each txg. + */ + uint64_t vre_bytes_copied_pertxg[TXG_SIZE]; + + /* + * The rangelock prevents normal read/write zio's from happening while + * there are expansion (reflow) i/os in progress to the same offsets. + */ + zfs_rangelock_t vre_rangelock; + + /* + * These fields are stored on-disk in the vdev_top_zap: + */ + dsl_scan_state_t vre_state; + uint64_t vre_start_time; + uint64_t vre_end_time; + uint64_t vre_bytes_copied; +} vdev_raidz_expand_t; + typedef struct vdev_raidz { - int vd_logical_width; + /* + * Number of child vdevs when this raidz vdev was created (i.e. before + * any raidz expansions). + */ + int vd_original_width; + + /* + * The current number of child vdevs, which may be more than the + * original width if an expansion is in progress or has completed. + */ + int vd_physical_width; + int vd_nparity; + + /* + * Tree of reflow_node_t's. The lock protects the avl tree only. + * The reflow_node_t's describe completed expansions, and are used + * to determine the logical width given a block's birth time. 
+ */ + avl_tree_t vd_expand_txgs; + kmutex_t vd_expand_lock; + + /* + * If this vdev is being expanded, spa_raidz_expand is set to this + */ + vdev_raidz_expand_t vn_vre; } vdev_raidz_t; +extern int vdev_raidz_attach_check(vdev_t *); +extern void vdev_raidz_attach_sync(void *, dmu_tx_t *); +extern void spa_start_raidz_expansion_thread(spa_t *); +extern int spa_raidz_expand_get_stats(spa_t *, pool_raidz_expand_stat_t *); +extern int vdev_raidz_load(vdev_t *); + +/* RAIDZ scratch area pause points (for testing) */ +#define RAIDZ_EXPAND_PAUSE_NONE 0 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1 1 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2 2 +#define RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3 3 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_VALID 4 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED 5 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1 6 +#define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2 7 + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 890e725e18d8..45cb5864a22b 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -30,6 +30,8 @@ #include <sys/kstat.h> #include <sys/abd.h> #include <sys/vdev_impl.h> +#include <sys/abd_impl.h> +#include <sys/zfs_rlock.h> #ifdef __cplusplus extern "C" { @@ -102,35 +104,39 @@ typedef struct raidz_impl_ops { char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */ } raidz_impl_ops_t; + typedef struct raidz_col { - uint64_t rc_devidx; /* child device index for I/O */ + int rc_devidx; /* child device index for I/O */ + uint32_t rc_size; /* I/O size */ uint64_t rc_offset; /* device offset */ - uint64_t rc_size; /* I/O size */ abd_t rc_abdstruct; /* rc_abd probably points here */ abd_t *rc_abd; /* I/O data */ abd_t *rc_orig_data; /* pre-reconstruction */ int rc_error; /* I/O error for this device */ - uint8_t rc_tried; /* Did we attempt this I/O column? */ - uint8_t rc_skipped; /* Did we skip this I/O column? */ - uint8_t rc_need_orig_restore; /* need to restore from orig_data? */ - uint8_t rc_force_repair; /* Write good data to this column */ - uint8_t rc_allow_repair; /* Allow repair I/O to this column */ + uint8_t rc_tried:1; /* Did we attempt this I/O column? */ + uint8_t rc_skipped:1; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? 
*/ + uint8_t rc_force_repair:1; /* Write good data to this column */ + uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + int rc_shadow_devidx; /* for double write during expansion */ + int rc_shadow_error; /* for double write during expansion */ + uint64_t rc_shadow_offset; /* for double write during expansion */ } raidz_col_t; typedef struct raidz_row { - uint64_t rr_cols; /* Regular column count */ - uint64_t rr_scols; /* Count including skipped columns */ - uint64_t rr_bigcols; /* Remainder data column count */ - uint64_t rr_missingdata; /* Count of missing data devices */ - uint64_t rr_missingparity; /* Count of missing parity devices */ - uint64_t rr_firstdatacol; /* First data column/parity count */ + int rr_cols; /* Regular column count */ + int rr_scols; /* Count including skipped columns */ + int rr_bigcols; /* Remainder data column count */ + int rr_missingdata; /* Count of missing data devices */ + int rr_missingparity; /* Count of missing parity devices */ + int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ #endif - raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ + raidz_col_t rr_col[]; /* Flexible array of I/O columns */ } raidz_row_t; typedef struct raidz_map { @@ -138,10 +144,25 @@ typedef struct raidz_map { int rm_nrows; /* Regular row count */ int rm_nskip; /* RAIDZ sectors skipped for padding */ int rm_skipstart; /* Column index of padding start */ + int rm_original_width; /* pre-expansion width of raidz vdev */ + int rm_nphys_cols; /* num entries in rm_phys_col[] */ + zfs_locked_range_t *rm_lr; const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_row_t *rm_row[0]; /* flexible array of rows */ + raidz_col_t *rm_phys_col; /* if non-NULL, read i/o aggregation */ + 
raidz_row_t *rm_row[]; /* flexible array of rows */ } raidz_map_t; +/* + * Nodes in vdev_raidz_t:vd_expand_txgs. + * Blocks with physical birth time of re_txg or later have the specified + * logical width (until the next node). + */ +typedef struct reflow_node { + uint64_t re_txg; + uint64_t re_logical_width; + avl_node_t re_link; +} reflow_node_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) @@ -321,7 +342,7 @@ vdev_raidz_exp2(const uint8_t a, const unsigned exp) * Galois Field operations. * * gf_exp2 - computes 2 raised to the given power - * gf_exp2 - computes 4 raised to the given power + * gf_exp4 - computes 4 raised to the given power * gf_mul - multiplication * gf_div - division * gf_inv - multiplicative inverse diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index b59fbe153903..55ec6c570316 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -79,6 +79,7 @@ typedef struct vdev_rebuild { uint64_t vr_pass_start_time; uint64_t vr_pass_bytes_scanned; uint64_t vr_pass_bytes_issued; + uint64_t vr_pass_bytes_skipped; /* On-disk state updated by vdev_rebuild_zap_update_sync() */ vdev_rebuild_phys_t vr_rebuild_phys; diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index e3bab0658d62..70b743f4ec6b 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -87,7 +87,7 @@ extern int spa_vdev_remove_cancel(spa_t *); extern void spa_vdev_removal_destroy(spa_vdev_removal_t *); extern uint64_t spa_remove_max_segment(spa_t *); -extern int vdev_removal_max_span; +extern uint_t vdev_removal_max_span; #ifdef __cplusplus } diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h index 16f4be2a41f8..7a94d4af098f 100644 --- a/include/sys/vdev_trim.h +++ b/include/sys/vdev_trim.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -41,6 +41,7 @@ extern void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state); extern void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list); extern void vdev_trim_restart(vdev_t *vd); extern void vdev_autotrim(spa_t *spa); +extern void vdev_autotrim_kick(spa_t *spa); extern void vdev_autotrim_stop_all(spa_t *spa); extern void vdev_autotrim_stop_wait(vdev_t *vd); extern void vdev_autotrim_restart(spa_t *spa); diff --git a/include/sys/xvattr.h b/include/sys/xvattr.h index 277c4694069d..a7994db894b9 100644 --- a/include/sys/xvattr.h +++ b/include/sys/xvattr.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zap.h b/include/sys/zap.h index dc2f661fb065..96ddcc324b65 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key, int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. 
If an @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); +int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 4549a9bd1177..2959aa9b2ca4 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -66,10 +66,9 @@ typedef struct mzap_phys { } mzap_phys_t; typedef struct mzap_ent { - avl_node_t mze_node; - int mze_chunkid; - uint64_t mze_hash; - uint32_t mze_cd; /* copy from mze_phys->mze_cd */ + uint32_t mze_hash; + uint16_t mze_cd; /* copy from mze_phys->mze_cd */ + uint16_t mze_chunkid; } mzap_ent_t; #define MZE_PHYS(zap, mze) \ @@ -146,6 +145,7 @@ typedef struct zap { dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; + dnode_t *zap_dnode; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; boolean_t zap_ismicro; @@ -164,7 +164,7 @@ typedef struct zap { int16_t zap_num_entries; int16_t zap_num_chunks; int16_t zap_alloc_next; - avl_tree_t zap_avl; + zfs_btree_t zap_tree; } zap_micro; } zap_u; } zap_t; @@ -203,7 +203,7 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, zap_t **zapp); void zap_unlockdir(zap_t *zap, const void *tag); void zap_evict_sync(void *dbu); -zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); +zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); uint32_t zap_maxcd(zap_t *zap); diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index a3da1036a5ee..e54456d3472b 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -47,7 +47,7 @@ struct zap_stats; * entries - header space (2*chunksize) */ #define ZAP_LEAF_NUMCHUNKS_BS(bs) \ - (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ + (((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ ZAP_LEAF_CHUNKSIZE - 2) #define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) @@ -80,7 +80,7 @@ struct zap_stats; * chunks per entry (3). */ #define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) -#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs)) #define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) #define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) @@ -132,7 +132,7 @@ typedef struct zap_leaf_phys { * with the ZAP_LEAF_CHUNK() macro. */ - uint16_t l_hash[1]; + uint16_t l_hash[]; } zap_leaf_phys_t; typedef union zap_leaf_chunk { @@ -163,7 +163,7 @@ typedef struct zap_leaf { dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ - int l_bs; /* block size shift */ + uint_t l_bs; /* block size shift */ dmu_buf_t *l_dbuf; } zap_leaf_t; @@ -243,7 +243,7 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, */ extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); -extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); +extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t len); extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, struct zap_stats *zs); diff --git a/include/sys/zcp.h b/include/sys/zcp.h index f0a78f9cb5c4..6301cc08e7ea 100644 --- a/include/sys/zcp.h +++ b/include/sys/zcp.h @@ -33,8 +33,8 @@ extern "C" { #define ZCP_RUN_INFO_KEY "runinfo" -extern unsigned long zfs_lua_max_instrlimit; -extern unsigned long zfs_lua_max_memlimit; +extern uint64_t zfs_lua_max_instrlimit; +extern uint64_t zfs_lua_max_memlimit; int 
zcp_argerror(lua_State *, int, const char *, ...); diff --git a/include/sys/zcp_iter.h b/include/sys/zcp_iter.h index 1d92d0c6d10c..fa6eeef25edd 100644 --- a/include/sys/zcp_iter.h +++ b/include/sys/zcp_iter.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h index 5abde149a615..bf9361374d33 100644 --- a/include/sys/zfeature.h +++ b/include/sys/zfeature.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_acl.h b/include/sys/zfs_acl.h index 98387a49adbe..e19288528849 100644 --- a/include/sys/zfs_acl.h +++ b/include/sys/zfs_acl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -206,7 +206,7 @@ struct zfsvfs; #ifdef _KERNEL int zfs_acl_ids_create(struct znode *, int, vattr_t *, - cred_t *, vsecattr_t *, zfs_acl_ids_t *); + cred_t *, vsecattr_t *, zfs_acl_ids_t *, zidmap_t *); void zfs_acl_ids_free(zfs_acl_ids_t *); boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *, uint64_t); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); @@ -215,15 +215,16 @@ void zfs_acl_rele(void *); void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); -extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *, + zidmap_t *); int zfs_fastaccesschk_execute(struct znode *, cred_t *); -extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); -extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *, zidmap_t *); +extern int zfs_zaccess_unix(void *, int, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); -int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); +int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *, zidmap_t *); int zfs_zaccess_rename(struct znode *, struct znode *, - struct znode *, struct znode *, cred_t *cr); + struct znode *, struct znode *, cred_t *cr, zidmap_t *mnt_ns); void zfs_acl_free(zfs_acl_t *); int zfs_vsec_2_aclp(struct zfsvfs *, umode_t, vsecattr_t *, cred_t *, struct zfs_fuid_info **, zfs_acl_t **); diff --git a/include/sys/zfs_chksum.h b/include/sys/zfs_chksum.h index cfd07bd0ffe7..a0e1b35189bb 100644 --- a/include/sys/zfs_chksum.h +++ b/include/sys/zfs_chksum.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 50257f1d6fdd..8f264b50e995 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -50,6 +50,7 @@ extern "C" { #include <sys/kmem.h> #include <sys/kmem_cache.h> #include <sys/vmem.h> +#include <sys/misc.h> #include <sys/taskq.h> #include <sys/param.h> #include <sys/disp.h> @@ -150,10 +151,14 @@ extern "C" { extern void dprintf_setup(int *argc, char **argv); -extern void cmn_err(int, const char *, ...); -extern void vcmn_err(int, const char *, va_list); -extern __attribute__((noreturn)) void panic(const char *, ...); -extern __attribute__((noreturn)) void vpanic(const char *, va_list); +extern void cmn_err(int, const char *, ...) + __attribute__((format(printf, 2, 3))); +extern void vcmn_err(int, const char *, va_list) + __attribute__((format(printf, 2, 0))); +extern void panic(const char *, ...) 
+ __attribute__((format(printf, 1, 2), noreturn)); +extern void vpanic(const char *, va_list) + __attribute__((format(printf, 1, 0), noreturn)); #define fm_panic panic @@ -219,14 +224,13 @@ typedef pthread_t kthread_t; #define TS_JOINABLE 0x00000004 #define curthread ((void *)(uintptr_t)pthread_self()) -#define kpreempt(x) yield() #define getcomm() "unknown" #define thread_create_named(name, stk, stksize, func, arg, len, \ pp, state, pri) \ - zk_thread_create(func, arg, stksize, state) + zk_thread_create(name, func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ - zk_thread_create(func, arg, stksize, state) + zk_thread_create(#func, func, arg, stksize, state) #define thread_exit() pthread_exit(NULL) #define thread_join(t) pthread_join((pthread_t)(t), NULL) @@ -242,15 +246,17 @@ extern struct proc p0; #define PS_NONE -1 -extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, - size_t stksize, int state); +extern kthread_t *zk_thread_create(const char *name, void (*func)(void *), + void *arg, size_t stksize, int state); #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) +#define KPREEMPT_SYNC (-1) + +#define kpreempt(x) sched_yield() #define kpreempt_disable() ((void)0) #define kpreempt_enable() ((void)0) -#define cond_resched() sched_yield() /* * Mutexes @@ -268,11 +274,13 @@ typedef struct kmutex { extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); +extern int mutex_enter_check_return(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) +#define mutex_enter_interruptible(mp) mutex_enter_check_return(mp) /* * RW locks */ @@ -488,6 +496,8 @@ extern taskq_t *system_taskq; extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); 
+extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t, + kthread_t ***); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ @@ -689,6 +699,11 @@ extern char *kmem_asprintf(const char *fmt, ...); #define kmem_strfree(str) kmem_free((str), strlen(str) + 1) #define kmem_strdup(s) strdup(s) +#ifndef __cplusplus +extern int kmem_scnprintf(char *restrict str, size_t size, + const char *restrict fmt, ...); +#endif + /* * Hostname information */ diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 7b103510dd07..8d94557a5882 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -57,6 +57,8 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_TRIM (1 << 11) #define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) #define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) +#define ZFS_DEBUG_BRT (1 << 14) +#define ZFS_DEBUG_RAIDZ_RECONSTRUCT (1 << 15) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/include/sys/zfs_delay.h b/include/sys/zfs_delay.h index 40e617dba961..56ac1f3c439b 100644 --- a/include/sys/zfs_delay.h +++ b/include/sys/zfs_delay.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/zfs_file.h b/include/sys/zfs_file.h index 02cd1a6f041a..e944165adc40 100644 --- a/include/sys/zfs_file.h +++ b/include/sys/zfs_file.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_fuid.h b/include/sys/zfs_fuid.h index 1975e57cf62b..d6b2942d1bec 100644 --- a/include/sys/zfs_fuid.h +++ b/include/sys/zfs_fuid.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_impl.h b/include/sys/zfs_impl.h new file mode 100644 index 000000000000..df4899f132b8 --- /dev/null +++ b/include/sys/zfs_impl.h @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de> + */ + +#ifndef _SYS_ZFS_IMPL_H +#define _SYS_ZFS_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* generic implementation backends */ +typedef struct +{ + /* algorithm name */ + const char *name; + + /* get number of supported implementations */ + uint32_t (*getcnt)(void); + + /* get id of selected implementation */ + uint32_t (*getid)(void); + + /* get name of selected implementation */ + const char *(*getname)(void); + + /* setup id as fastest implementation */ + void (*set_fastest)(uint32_t id); + + /* set implementation by id */ + void (*setid)(uint32_t id); + + /* set implementation by name */ + int (*setname)(const char *val); +} zfs_impl_t; + +/* return some set of function pointer */ +extern const zfs_impl_t *zfs_impl_get_ops(const char *algo); + +extern const zfs_impl_t zfs_blake3_ops; +extern const zfs_impl_t zfs_sha256_ops; +extern const zfs_impl_t zfs_sha512_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IMPL_H */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 94522179676a..525d40759fdd 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. 
All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. */ @@ -124,7 +124,13 @@ typedef enum drr_headertype { * default use of "zfs send" won't encounter the bug mentioned above. */ #define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) -#define DMU_BACKUP_FEATURE_BLAKE3 (1 << 28) +/* flag #28 is reserved for a Nutanix feature */ +/* + * flag #29 is the last unused bit. It is reserved to indicate a to-be-designed + * extension to the stream format which will accommodate more feature flags. + * If you need to add another feature flag, please reach out to the OpenZFS + * community, e.g., on GitHub or Slack. + */ /* * Mask of all supported backup features @@ -135,7 +141,7 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ - DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_BLAKE3) + DMU_BACKUP_FEATURE_ZSTD) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -448,6 +454,8 @@ typedef enum zinject_type { ZINJECT_PANIC, ZINJECT_DELAY_IO, ZINJECT_DECRYPT_FAULT, + ZINJECT_DELAY_IMPORT, + ZINJECT_DELAY_EXPORT, } zinject_type_t; typedef struct zfs_share { @@ -569,7 +577,6 @@ typedef struct zfsdev_state { extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which); extern int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp); -extern uint_t zfs_fsyncer_key; extern uint_t zfs_allow_log_key; #endif /* _KERNEL */ diff --git a/include/sys/zfs_ioctl_impl.h b/include/sys/zfs_ioctl_impl.h index f9e4f6e6c4b2..cb852c5577fd 100644 --- a/include/sys/zfs_ioctl_impl.h +++ b/include/sys/zfs_ioctl_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -24,7 +24,7 @@ #define _ZFS_IOCTL_IMPL_H_ extern kmutex_t zfsdev_state_lock; -extern unsigned long zfs_max_nvlist_src_size; +extern uint64_t zfs_max_nvlist_src_size; typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); diff --git a/include/sys/zfs_onexit.h b/include/sys/zfs_onexit.h index fd3030e3ac2d..91f49d4cc5a3 100644 --- a/include/sys/zfs_onexit.h +++ b/include/sys/zfs_onexit.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -54,7 +54,7 @@ extern void zfs_onexit_destroy(zfs_onexit_t *zo); extern zfs_file_t *zfs_onexit_fd_hold(int fd, minor_t *minorp); extern void zfs_onexit_fd_rele(zfs_file_t *); extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle); + uintptr_t *action_handle); #ifdef __cplusplus } diff --git a/include/sys/zfs_project.h b/include/sys/zfs_project.h index 81a238905225..8a46e5e068db 100644 --- a/include/sys/zfs_project.h +++ b/include/sys/zfs_project.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/zfs_quota.h b/include/sys/zfs_quota.h index b215b8dd0013..4567cc651afb 100644 --- a/include/sys/zfs_quota.h +++ b/include/sys/zfs_quota.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_racct.h b/include/sys/zfs_racct.h index cfcdd336ea42..0e8bd04c1a13 100644 --- a/include/sys/zfs_racct.h +++ b/include/sys/zfs_racct.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h index 2f59ebb32b07..77965a0aa580 100644 --- a/include/sys/zfs_refcount.h +++ b/include/sys/zfs_refcount.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -27,6 +27,7 @@ #define _SYS_ZFS_REFCOUNT_H #include <sys/inttypes.h> +#include <sys/avl.h> #include <sys/list.h> #include <sys/zfs_context.h> @@ -43,19 +44,22 @@ extern "C" { #ifdef ZFS_DEBUG typedef struct reference { - list_node_t ref_link; + union { + avl_node_t a; + list_node_t l; + } ref_link; const void *ref_holder; uint64_t ref_number; - uint8_t *ref_removed; + boolean_t ref_search; } reference_t; typedef struct refcount { + uint64_t rc_count; kmutex_t rc_mtx; - boolean_t rc_tracked; - list_t rc_list; + avl_tree_t rc_tree; list_t rc_removed; - uint64_t rc_count; - uint64_t rc_removed_count; + uint_t rc_removed_count; + boolean_t rc_tracked; } zfs_refcount_t; /* @@ -73,13 +77,15 @@ int64_t zfs_refcount_count(zfs_refcount_t *); int64_t zfs_refcount_add(zfs_refcount_t *, const void *); int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); /* - * Note that (add|remove)_many add/remove one reference with "number" N, - * _not_ make N references with "number" 1, which is what vanilla - * zfs_refcount_(add|remove) would do if called N times. + * Note that (add|remove)_many adds/removes one reference with "number" N, + * _not_ N references with "number" 1, which is what (add|remove)_few does, + * or what vanilla zfs_refcount_(add|remove) called N times would do. * * Attempting to remove a reference with number N when none exists is a * panic on debug kernels with reference_tracking enabled. 
*/ +void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *); +void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); @@ -108,6 +114,10 @@ typedef struct refcount { #define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count) #define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) #define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) +#define zfs_refcount_add_few(rc, number, holder) \ + atomic_add_64(&(rc)->rc_count, number) +#define zfs_refcount_remove_few(rc, number, holder) \ + atomic_add_64(&(rc)->rc_count, -number) #define zfs_refcount_add_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, number) #define zfs_refcount_remove_many(rc, number, holder) \ diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h index 2302abb37337..5e5d6d68d6c5 100644 --- a/include/sys/zfs_rlock.h +++ b/include/sys/zfs_rlock.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h index 6b0336997c20..1b4b8abf0244 100644 --- a/include/sys/zfs_sa.h +++ b/include/sys/zfs_sa.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* diff --git a/include/sys/zfs_stat.h b/include/sys/zfs_stat.h index 465aefaa2063..1589f945cbd7 100644 --- a/include/sys/zfs_stat.h +++ b/include/sys/zfs_stat.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_sysfs.h b/include/sys/zfs_sysfs.h index d1cb2ef4321c..6fe9b7a9cd2c 100644 --- a/include/sys/zfs_sysfs.h +++ b/include/sys/zfs_sysfs.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h index a438c86f0a0c..19ae7b77b459 100644 --- a/include/sys/zfs_vfsops.h +++ b/include/sys/zfs_vfsops.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 18259f0dc9b5..e60b99bed192 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * @@ -24,13 +24,20 @@ #ifndef _SYS_FS_ZFS_VNOPS_H #define _SYS_FS_ZFS_VNOPS_H + #include <sys/zfs_vnops_os.h> +extern int zfs_bclone_enabled; + extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_holey(znode_t *, ulong_t, loff_t *); extern int zfs_access(znode_t *, int, int, cred_t *); +extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *, + uint64_t *, cred_t *); +extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t, + const blkptr_t *, size_t); extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 098cf9dbc16f..d71144807f47 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -158,6 +158,7 @@ extern "C" { #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); #ifdef _KERNEL #include <sys/zfs_znode_impl.h> @@ -188,9 +189,7 @@ typedef struct znode { boolean_t z_atime_dirty; /* atime needs to be synced */ boolean_t z_zn_prefetch; /* Prefetch znodes? */ boolean_t z_is_sa; /* are we native sa? 
*/ - boolean_t z_is_mapped; /* are we mmap'ed */ boolean_t z_is_ctldir; /* are we .zfs entry */ - boolean_t z_is_stale; /* are we stale due to rollback? */ boolean_t z_suspended; /* extra ref from a suspend? */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ @@ -218,11 +217,34 @@ typedef struct znode { ZNODE_OS_FIELDS; } znode_t; +/* Verifies the znode is valid. */ +static inline int +zfs_verify_zp(znode_t *zp) +{ + if (unlikely(zp->z_sa_hdl == NULL)) + return (SET_ERROR(EIO)); + return (0); +} + +/* zfs_enter and zfs_verify_zp together */ +static inline int +zfs_enter_verify_zp(zfsvfs_t *zfsvfs, znode_t *zp, const char *tag) +{ + int error; + if ((error = zfs_enter(zfsvfs, tag)) != 0) + return (error); + if ((error = zfs_verify_zp(zp)) != 0) { + zfs_exit(zfsvfs, tag); + return (error); + } + return (0); +} + typedef struct znode_hold { uint64_t zh_obj; /* object id */ - kmutex_t zh_lock; /* lock serializing object access */ avl_node_t zh_node; /* avl tree linkage */ - zfs_refcount_t zh_refcount; /* active consumer reference count */ + kmutex_t zh_lock; /* lock serializing object access */ + int zh_refcount; /* active consumer reference count */ } znode_hold_t; static inline uint64_t @@ -250,6 +272,8 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); extern int zfs_znode_hold_compare(const void *, const void *); +extern znode_hold_t *zfs_znode_hold_enter(zfsvfs_t *, uint64_t); +extern void zfs_znode_hold_exit(zfsvfs_t *, znode_hold_t *); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); extern int zfs_rezget(znode_t *); extern void zfs_zinactive(znode_t *); @@ -257,7 +281,6 @@ extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_remove_op_tables(void); extern int zfs_create_op_tables(void); extern dev_t zfs_cmpldev(uint64_t); -extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int 
zfs_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); extern void zfs_znode_dmu_fini(znode_t *); @@ -277,8 +300,14 @@ extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, znode_t *szp); +extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, + const char *dname, znode_t *szp); +extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, + const char *dname, znode_t *szp, znode_t *wzp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t len, int ioflag, + znode_t *zp, offset_t off, ssize_t len, boolean_t commit, zil_callback_t callback, void *callback_data); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); @@ -286,6 +315,9 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); +extern void zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t offset, uint64_t length, uint64_t blksz, + const blkptr_t *bps, size_t nbps); extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern void zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zil.h b/include/sys/zil.h index 2a7381f016ab..4747ecc067a9 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -33,6 +33,7 @@ #include <sys/zio.h> #include <sys/dmu.h> #include <sys/zio_crypt.h> +#include <sys/wmsum.h> #ifdef __cplusplus extern "C" { @@ -163,7 +164,10 @@ typedef enum zil_create { #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ #define TX_WRITE2 20 /* dmu_sync EALREADY write */ #define TX_SETSAXATTR 21 /* Set sa xattrs on file */ -#define TX_MAX_TYPE 22 /* Max transaction type */ +#define TX_RENAME_EXCHANGE 22 /* Atomic swap via renameat2 */ +#define TX_RENAME_WHITEOUT 23 /* Atomic whiteout via renameat2 */ +#define TX_CLONE_RANGE 24 /* Clone a file range */ +#define TX_MAX_TYPE 25 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -173,9 +177,9 @@ typedef enum zil_create { #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ /* - * Transactions for write, truncate, setattr, acl_v0, and acl can be logged - * out of order. For convenience in the code, all such records must have - * lr_foid at the same offset. + * Transactions for operations below can be logged out of order. + * For convenience in the code, all such records must have lr_foid + * at the same offset. 
*/ #define TX_OOO(txtype) \ ((txtype) == TX_WRITE || \ @@ -184,7 +188,8 @@ typedef enum zil_create { (txtype) == TX_ACL_V0 || \ (txtype) == TX_ACL || \ (txtype) == TX_WRITE2 || \ - (txtype) == TX_SETSAXATTR) + (txtype) == TX_SETSAXATTR || \ + (txtype) == TX_CLONE_RANGE) /* * The number of dnode slots consumed by the object is stored in the 8 @@ -317,6 +322,19 @@ typedef struct { } lr_rename_t; typedef struct { + lr_rename_t lr_rename; /* common rename portion */ + /* members related to the whiteout file (based on lr_create_t) */ + uint64_t lr_wfoid; /* obj id of the new whiteout file */ + uint64_t lr_wmode; /* mode of object */ + uint64_t lr_wuid; /* uid of whiteout */ + uint64_t lr_wgid; /* gid of whiteout */ + uint64_t lr_wgen; /* generation (txg of creation) */ + uint64_t lr_wcrtime[2]; /* creation time */ + uint64_t lr_wrdev; /* always makedev(0, 0) */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_whiteout_t; + +typedef struct { lr_t lr_common; /* common portion of log record */ uint64_t lr_foid; /* file object to write */ uint64_t lr_offset; /* offset to write to */ @@ -371,6 +389,17 @@ typedef struct { /* lr_acl_bytes number of variable sized ace's follows */ } lr_acl_t; +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to clone into */ + uint64_t lr_offset; /* offset to clone to */ + uint64_t lr_length; /* length of the blocks to clone */ + uint64_t lr_blksz; /* file's block size */ + uint64_t lr_nbps; /* number of block pointers */ + blkptr_t lr_bps[]; + /* block pointers of the blocks to clone follows */ +} lr_clone_range_t; + /* * ZIL structure definitions, interface function prototype and globals. */ @@ -460,24 +489,54 @@ typedef struct zil_stats { * Transactions which have been allocated to the "normal" * (i.e. not slog) storage pool. Note that "bytes" accumulate * the actual log record sizes - which do not include the actual - * data in case of indirect writes. 
+ * data in case of indirect writes. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_normal_count; kstat_named_t zil_itx_metaslab_normal_bytes; + kstat_named_t zil_itx_metaslab_normal_write; + kstat_named_t zil_itx_metaslab_normal_alloc; /* * Transactions which have been allocated to the "slog" storage pool. * If there are no separate log devices, this is the same as the - * "normal" pool. + * "normal" pool. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_slog_count; kstat_named_t zil_itx_metaslab_slog_bytes; -} zil_stats_t; - -#define ZIL_STAT_INCR(stat, val) \ - atomic_add_64(&zil_stats.stat.value.ui64, (val)); -#define ZIL_STAT_BUMP(stat) \ - ZIL_STAT_INCR(stat, 1); + kstat_named_t zil_itx_metaslab_slog_write; + kstat_named_t zil_itx_metaslab_slog_alloc; +} zil_kstat_values_t; + +typedef struct zil_sums { + wmsum_t zil_commit_count; + wmsum_t zil_commit_writer_count; + wmsum_t zil_itx_count; + wmsum_t zil_itx_indirect_count; + wmsum_t zil_itx_indirect_bytes; + wmsum_t zil_itx_copied_count; + wmsum_t zil_itx_copied_bytes; + wmsum_t zil_itx_needcopy_count; + wmsum_t zil_itx_needcopy_bytes; + wmsum_t zil_itx_metaslab_normal_count; + wmsum_t zil_itx_metaslab_normal_bytes; + wmsum_t zil_itx_metaslab_normal_write; + wmsum_t zil_itx_metaslab_normal_alloc; + wmsum_t zil_itx_metaslab_slog_count; + wmsum_t zil_itx_metaslab_slog_bytes; + wmsum_t zil_itx_metaslab_slog_write; + wmsum_t zil_itx_metaslab_slog_alloc; +} zil_sums_t; + +#define ZIL_STAT_INCR(zil, stat, val) \ + do { \ + int64_t tmpval = (val); \ + wmsum_add(&(zil_sums_global.stat), tmpval); \ + if ((zil)->zl_sums) \ + wmsum_add(&((zil)->zl_sums->stat), tmpval); \ + } while (0) + +#define ZIL_STAT_BUMP(zil, stat) \ + ZIL_STAT_INCR(zil, stat, 1); typedef int zil_parse_blk_func_t(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t txg); @@ -497,13 +556,14 @@ extern void zil_fini(void); extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); extern void zil_free(zilog_t *zilog); 
-extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); +extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data, + zil_sums_t *zil_sums); extern void zil_close(zilog_t *zilog); -extern void zil_replay(objset_t *os, void *arg, +extern boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]); extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); -extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); @@ -535,7 +595,12 @@ extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern uint64_t zil_max_copied_data(zilog_t *zilog); -extern uint64_t zil_max_log_data(zilog_t *zilog); +extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize); + +extern void zil_sums_init(zil_sums_t *zs); +extern void zil_sums_fini(zil_sums_t *zs); +extern void zil_kstat_values_update(zil_kstat_values_t *zs, + zil_sums_t *zil_sums); extern int zil_replay_disable; diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index 8409ce864e90..9a34bafc1c77 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -38,14 +38,22 @@ extern "C" { /* * Possible states for a given lwb structure. * - * An lwb will start out in the "closed" state, and then transition to - * the "opened" state via a call to zil_lwb_write_open(). 
When - * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" - * must be held. + * An lwb will start out in the "new" state, and transition to the "opened" + * state via a call to zil_lwb_write_open() on first itx assignment. When + * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be + * held. * - * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must - * be held when making this transition. + * After the lwb is "opened", it can be assigned number of itxs and transition + * into the "closed" state via zil_lwb_write_close() when full or on timeout. + * When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock" + * must be held. New lwb allocation also takes "zl_lock" to protect the list. + * + * After the lwb is "closed", it can transition into the "ready" state via + * zil_lwb_write_issue(). "zl_lock" must be held when making this transition. + * Since it is done by the same thread, "zl_issuer_lock" is not needed. + * + * When lwb in "ready" state receives its block pointer, it can transition to + * "issued". "zl_lock" must be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write * done" state via zil_lwb_write_done(); and then into the "flush done" @@ -62,17 +70,20 @@ extern "C" { * * Additionally, correctness when reading an lwb's state is often * achieved by exploiting the fact that these state transitions occur in - * this specific order; i.e. "closed" to "opened" to "issued" to "done". + * this specific order; i.e. "new" to "opened" to "closed" to "ready" to + * "issued" to "write_done" and finally "flush_done". * - * Thus, if an lwb is in the "closed" or "opened" state, holding the + * Thus, if an lwb is in the "new" or "opened" state, holding the * "zl_issuer_lock" will prevent a concurrent thread from transitioning - * that lwb to the "issued" state. 
Likewise, if an lwb is already in the - * "issued" state, holding the "zl_lock" will prevent a concurrent - * thread from transitioning that lwb to the "write done" state. + * that lwb to the "closed" state. Likewise, if an lwb is already in the + * "ready" state, holding the "zl_lock" will prevent a concurrent thread + * from transitioning that lwb to the "issued" state. */ typedef enum { - LWB_STATE_CLOSED, + LWB_STATE_NEW, LWB_STATE_OPENED, + LWB_STATE_CLOSED, + LWB_STATE_READY, LWB_STATE_ISSUED, LWB_STATE_WRITE_DONE, LWB_STATE_FLUSH_DONE, @@ -91,22 +102,28 @@ typedef enum { typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ + boolean_t lwb_slim; /* log block has slim format */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ + int lwb_error; /* log block allocation error */ + int lwb_nmax; /* max bytes in the buffer */ int lwb_nused; /* # used bytes in buffer */ + int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ + zio_t *lwb_child_zio; /* parent zio for children */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ + hrtime_t lwb_issued_timestamp; /* when was the lwb issued? 
*/ uint64_t lwb_issued_txg; /* the txg when the write is issued */ + uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ + list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ list_t lwb_itxs; /* list of itx's */ list_t lwb_waiters; /* list of zil_commit_waiter's */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ - hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ } lwb_t; /* @@ -164,7 +181,7 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; -#define ZIL_PREV_BLKS 16 +#define ZIL_BURSTS 8 /* * Stable storage intent log management structure. One per dataset. @@ -199,14 +216,18 @@ struct zilog { uint64_t zl_parse_lr_count; /* number of log records parsed */ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ list_t zl_itx_commit_list; /* itx list to be committed */ - uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_cur_size; /* current burst full size */ + uint64_t zl_cur_left; /* current burst remaining size */ + uint64_t zl_cur_max; /* biggest record in current burst */ list_t zl_lwb_list; /* in-flight log write list */ avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ zil_header_t zl_old_header; /* debugging aid */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */ + uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ @@ -222,6 +243,9 @@ struct zilog 
{ * (see zil_max_copied_data()). */ uint64_t zl_max_block_size; + + /* Pointer for per dataset zil sums */ + zil_sums_t *zl_sums; }; typedef struct zil_bp_node { diff --git a/include/sys/zio.h b/include/sys/zio.h index 9bee7cc9b9fd..77c70b9b481c 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -22,12 +22,12 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome <tsoome@me.com> * Copyright (c) 2019, Allan Jude - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019-2020, Michael Niewöhner */ @@ -163,32 +163,36 @@ typedef enum zio_suspend_reason { ZIO_SUSPEND_MMP, } zio_suspend_reason_t; -enum zio_flag { +/* + * This was originally an enum type. However, those are 32-bit and there is no + * way to make a 64-bit enum type. Since we ran out of bits for flags, we were + * forced to upgrade it to a uint64_t. 
+ */ +typedef uint64_t zio_flag_t; /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ - ZIO_FLAG_DONT_AGGREGATE = 1U << 0, - ZIO_FLAG_IO_REPAIR = 1U << 1, - ZIO_FLAG_SELF_HEAL = 1U << 2, - ZIO_FLAG_RESILVER = 1U << 3, - ZIO_FLAG_SCRUB = 1U << 4, - ZIO_FLAG_SCAN_THREAD = 1U << 5, - ZIO_FLAG_PHYSICAL = 1U << 6, +#define ZIO_FLAG_DONT_AGGREGATE (1ULL << 0) +#define ZIO_FLAG_IO_REPAIR (1ULL << 1) +#define ZIO_FLAG_SELF_HEAL (1ULL << 2) +#define ZIO_FLAG_RESILVER (1ULL << 3) +#define ZIO_FLAG_SCRUB (1ULL << 4) +#define ZIO_FLAG_SCAN_THREAD (1ULL << 5) +#define ZIO_FLAG_PHYSICAL (1ULL << 6) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ - ZIO_FLAG_CANFAIL = 1U << 7, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1U << 8, - ZIO_FLAG_CONFIG_WRITER = 1U << 9, - ZIO_FLAG_DONT_RETRY = 1U << 10, - ZIO_FLAG_DONT_CACHE = 1U << 11, - ZIO_FLAG_NODATA = 1U << 12, - ZIO_FLAG_INDUCE_DAMAGE = 1U << 13, - ZIO_FLAG_IO_ALLOCATING = 1U << 14, +#define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */ +#define ZIO_FLAG_SPECULATIVE (1ULL << 8) +#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) +#define ZIO_FLAG_DONT_RETRY (1ULL << 10) +#define ZIO_FLAG_NODATA (1ULL << 12) +#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) +#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -196,30 +200,31 @@ enum zio_flag { /* * Flags inherited by vdev children. 
*/ - ZIO_FLAG_IO_RETRY = 1U << 15, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1U << 16, - ZIO_FLAG_TRYHARD = 1U << 17, - ZIO_FLAG_OPTIONAL = 1U << 18, +#define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */ +#define ZIO_FLAG_PROBE (1ULL << 16) +#define ZIO_FLAG_TRYHARD (1ULL << 17) +#define ZIO_FLAG_OPTIONAL (1ULL << 18) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ - ZIO_FLAG_DONT_QUEUE = 1U << 19, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1U << 20, - ZIO_FLAG_IO_BYPASS = 1U << 21, - ZIO_FLAG_IO_REWRITE = 1U << 22, - ZIO_FLAG_RAW_COMPRESS = 1U << 23, - ZIO_FLAG_RAW_ENCRYPT = 1U << 24, - ZIO_FLAG_GANG_CHILD = 1U << 25, - ZIO_FLAG_DDT_CHILD = 1U << 26, - ZIO_FLAG_GODFATHER = 1U << 27, - ZIO_FLAG_NOPWRITE = 1U << 28, - ZIO_FLAG_REEXECUTED = 1U << 29, - ZIO_FLAG_DELEGATED = 1U << 30, - ZIO_FLAG_FASTWRITE = 1U << 31, -}; +#define ZIO_FLAG_DONT_QUEUE (1ULL << 19) /* must be first for INHERIT */ +#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 20) +#define ZIO_FLAG_IO_BYPASS (1ULL << 21) +#define ZIO_FLAG_IO_REWRITE (1ULL << 22) +#define ZIO_FLAG_RAW_COMPRESS (1ULL << 23) +#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 24) +#define ZIO_FLAG_GANG_CHILD (1ULL << 25) +#define ZIO_FLAG_DDT_CHILD (1ULL << 26) +#define ZIO_FLAG_GODFATHER (1ULL << 27) +#define ZIO_FLAG_NOPWRITE (1ULL << 28) +#define ZIO_FLAG_REEXECUTED (1ULL << 29) +#define ZIO_FLAG_DELEGATED (1ULL << 30) + +#define ZIO_ALLOCATOR_NONE (-1) +#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) @@ -299,12 +304,12 @@ struct zbookmark_phys { uint64_t zb_blkid; }; -typedef struct zbookmark_err_phys { +struct zbookmark_err_phys { uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; uint64_t zb_birth; -} zbookmark_err_phys_t; +}; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ @@ -338,12 +343,13 @@ 
typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; - dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; + dmu_object_type_t zp_type; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; + boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; @@ -432,6 +438,12 @@ typedef struct zio_link { list_node_t zl_child_node; } zio_link_t; +enum zio_qstate { + ZIO_QS_NONE = 0, + ZIO_QS_QUEUED, + ZIO_QS_ACTIVE, +}; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -439,7 +451,6 @@ struct zio { zio_type_t io_type; enum zio_child io_child_type; enum trim_flag io_trim_flags; - int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; @@ -456,7 +467,6 @@ struct zio { /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; - zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ @@ -476,6 +486,12 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ + enum zio_qstate io_queue_state; /* vdev queue state */ + union { + list_node_t l; + avl_node_t a; + } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ + avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; @@ -483,25 +499,19 @@ struct zio { hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). 
*/ - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ - enum zio_flag io_flags; + zio_flag_t io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; - enum zio_flag io_orig_flags; + zio_flag_t io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; - uint64_t io_child_count; - uint64_t io_phys_children; - uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; @@ -526,57 +536,61 @@ enum blk_verify_flag { BLK_VERIFY_HALT }; +enum blk_config_flag { + BLK_CONFIG_HELD, // SCL_VDEV held for writer + BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader + BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV +}; + extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, - zio_done_func_t *done, void *priv, enum zio_flag flags); + zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *priv, enum zio_flag flags); + zio_done_func_t *done, void *priv, zio_flag_t flags); + +extern void zio_destroy(zio_t *zio); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); + zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *priv, zio_priority_t priority, enum zio_flag flags, - const zbookmark_phys_t *zb); + 
zio_done_func_t *done, void *priv, zio_priority_t priority, + zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); + zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, - boolean_t nopwrite); + boolean_t nopwrite, boolean_t brtwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_done_func_t *done, void *priv, enum zio_flag flags); - -extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *priv, enum zio_flag flags); + zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, - enum zio_flag flags, enum trim_flag trim_flags); + zio_flag_t flags, enum trim_flag trim_flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, - enum zio_flag flags, boolean_t labels); + zio_flag_t flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, - enum zio_flag flags, boolean_t labels); + zio_flag_t flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, - const blkptr_t *bp, enum zio_flag flags); + const blkptr_t *bp, zio_flag_t flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); @@ -595,6 +609,7 @@ extern zio_t 
*zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); +extern void zio_add_child_first(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); @@ -609,12 +624,12 @@ extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, int type, - zio_priority_t priority, enum zio_flag flags, + zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *priv); + zio_flag_t flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); @@ -639,7 +654,7 @@ extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, - boolean_t config_held, enum blk_verify_flag blk_verify); + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. 
@@ -668,6 +683,8 @@ extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); +extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); +extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); /* * Checksum ereport functions @@ -696,6 +713,8 @@ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); +boolean_t zbookmark_subtree_tbd(const struct dnode_phys *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 989f125e6afd..37fd65b7cb3e 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -94,15 +94,13 @@ typedef const struct zio_checksum_info { } zio_checksum_info_t; typedef struct zio_bad_cksum { - zio_cksum_t zbc_expected; - zio_cksum_t zbc_actual; const char *zbc_checksum_name; uint8_t zbc_byteswapped; uint8_t zbc_injected; uint8_t zbc_has_cksum; /* expected/actual valid */ } zio_bad_cksum_t; -_SYS_ZIO_CHECKSUM_H const zio_checksum_info_t +_SYS_ZIO_CHECKSUM_H zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* @@ -110,9 +108,9 @@ _SYS_ZIO_CHECKSUM_H const zio_checksum_info_t */ /* SHA2 */ -extern zio_checksum_t abd_checksum_SHA256; -extern zio_checksum_t abd_checksum_SHA512_native; -extern zio_checksum_t abd_checksum_SHA512_byteswap; +extern zio_checksum_t abd_checksum_sha256; +extern zio_checksum_t abd_checksum_sha512_native; +extern zio_checksum_t abd_checksum_sha512_byteswap; /* Skein */ extern zio_checksum_t abd_checksum_skein_native; diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 26600b43bb49..691d7b624488 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -152,7 +152,7 @@ typedef const struct zio_compress_info { zio_decompresslevel_func_t *ci_decompress_level; } zio_compress_info_t; -extern const zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; +extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; /* * lz4 compression init & free @@ -183,7 +183,7 @@ extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, /* * Compress and decompress data if necessary. 
*/ -extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, uint8_t level); extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, size_t d_len, uint8_t *level); diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 4c998571653a..2b026d48675a 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -25,6 +25,7 @@ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara Inc. */ #ifndef _ZIO_IMPL_H @@ -39,7 +40,7 @@ extern "C" { * * The ZFS I/O pipeline is comprised of various stages which are defined * in the zio_stage enum below. The individual stages are used to construct - * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. + * these basic I/O operations: Read, Write, Free, Claim, Flush and Trim. * * I/O operations: (XXX - provide detail for each of the operations) * @@ -47,7 +48,8 @@ extern "C" { * Write: * Free: * Claim: - * Ioctl: + * Flush: + * Trim: * * Although the most common pipeline are used by the basic I/O operations * above, there are some helper pipelines (one could consider them @@ -77,6 +79,12 @@ extern "C" { * and zstd. Compression occurs as part of the write pipeline and is * performed in the ZIO_STAGE_WRITE_BP_INIT stage. * + * Block cloning: + * The block cloning functionality introduces ZIO_STAGE_BRT_FREE stage which + * is called during a free pipeline. 
If the block is referenced in the + * Block Cloning Table (BRT) we will just decrease its reference counter + * instead of actually freeing the block. + * * Dedup: * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing @@ -114,43 +122,48 @@ extern "C" { * zio pipeline stage definitions */ enum zio_stage { - ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ + ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */ + + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R----- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W---- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F--- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* -WF--T */ + ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W---- */ - ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ - ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ - ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ - ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ + ZIO_STAGE_ENCRYPT = 1 << 6, /* -W---- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W---- */ - ZIO_STAGE_ENCRYPT = 1 << 6, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W--- */ + ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */ - ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */ + ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */ - ZIO_STAGE_DDT_READ_START = 1 << 9, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 12, /* --F-- */ + ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */ + ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */ + ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 13, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 14, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */ + ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */ - ZIO_STAGE_DVA_THROTTLE = 1 << 15, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 16, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 17, /* --F-- */ - 
ZIO_STAGE_DVA_CLAIM = 1 << 18, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W---- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W---- */ + ZIO_STAGE_DVA_FREE = 1 << 18, /* --F--- */ + ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C-- */ - ZIO_STAGE_READY = 1 << 19, /* RWFCI */ + ZIO_STAGE_READY = 1 << 20, /* RWFCXT */ - ZIO_STAGE_VDEV_IO_START = 1 << 20, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 21, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 22, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--XT */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--XT */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 23, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ - ZIO_STAGE_DONE = 1 << 24 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 25 /* RWFCXT */ }; +#define ZIO_ROOT_PIPELINE \ + ZIO_STAGE_DONE + #define ZIO_INTERLOCK_STAGES \ (ZIO_STAGE_READY | \ ZIO_STAGE_DONE) @@ -233,6 +246,7 @@ enum zio_stage { #define ZIO_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_BRT_FREE | \ ZIO_STAGE_DVA_FREE) #define ZIO_DDT_FREE_PIPELINE \ @@ -245,10 +259,9 @@ enum zio_stage { (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_DVA_CLAIM) -#define ZIO_IOCTL_PIPELINE \ +#define ZIO_FLUSH_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ - ZIO_STAGE_VDEV_IO_START | \ - ZIO_STAGE_VDEV_IO_ASSESS) + ZIO_VDEV_IO_STAGES) #define ZIO_TRIM_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ diff --git a/include/sys/zrlock.h b/include/sys/zrlock.h index b6eba1a18ff4..e2a7a254a6e0 100644 --- a/include/sys/zrlock.h +++ b/include/sys/zrlock.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -34,9 +34,8 @@ extern "C" { typedef struct zrlock { kmutex_t zr_mtx; - volatile int32_t zr_refcount; kcondvar_t zr_cv; - uint16_t zr_pad; + volatile int32_t zr_refcount; #ifdef ZFS_DEBUG kthread_t *zr_owner; const char *zr_caller; diff --git a/include/sys/zvol.h b/include/sys/zvol.h index a0f18001304e..c79fe1d9ad22 100644 --- a/include/sys/zvol.h +++ b/include/sys/zvol.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -50,8 +50,9 @@ extern int zvol_get_stats(objset_t *, nvlist_t *); extern boolean_t zvol_is_zvol(const char *); extern void zvol_create_cb(objset_t *, void *, cred_t *, dmu_tx_t *); extern int zvol_set_volsize(const char *, uint64_t); -extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t); -extern int zvol_set_volmode(const char *, zprop_source_t, uint64_t); +extern int zvol_set_volthreading(const char *, boolean_t); +extern int zvol_set_common(const char *, zfs_prop_t, zprop_source_t, uint64_t); +extern int zvol_set_ro(const char *, boolean_t); extern zvol_state_handle_t *zvol_suspend(const char *); extern int zvol_resume(zvol_state_handle_t *); extern void *zvol_tag(zvol_state_handle_t *); diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 94203347066b..6c15c84b6bf4 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. 
* @@ -58,6 +58,7 @@ typedef struct zvol_state { atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ struct zvol_state_os *zv_zso; /* private platform state */ + boolean_t zv_threading; /* volthreading property */ } zvol_state_t; @@ -81,9 +82,9 @@ void zvol_remove_minors_impl(const char *name); void zvol_last_close(zvol_state_t *zv); void zvol_insert(zvol_state_t *zv); void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, - uint64_t len, boolean_t sync); + uint64_t len); void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, - uint64_t size, int sync); + uint64_t size, boolean_t commit); int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio); int zvol_init_impl(void); |