diff options
Diffstat (limited to 'sys/contrib/openzfs/include')
90 files changed, 947 insertions, 571 deletions
diff --git a/sys/contrib/openzfs/include/Makefile.am b/sys/contrib/openzfs/include/Makefile.am index a9258deabfd7..7588cd0aedc9 100644 --- a/sys/contrib/openzfs/include/Makefile.am +++ b/sys/contrib/openzfs/include/Makefile.am @@ -10,6 +10,7 @@ COMMON_H = \ cityhash.h \ zfeature_common.h \ zfs_comutil.h \ + zfs_crrd.h \ zfs_deleg.h \ zfs_fletcher.h \ zfs_namecheck.h \ @@ -69,7 +70,6 @@ COMMON_H = \ sys/metaslab_impl.h \ sys/mmp.h \ sys/mntent.h \ - sys/mod.h \ sys/multilist.h \ sys/nvpair.h \ sys/nvpair_impl.h \ diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index 8774d490f74b..14930fb90622 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -30,6 +30,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> + * Copyright (c) 2025 Hewlett Packard Enterprise Development LP. */ #ifndef _LIBZFS_H @@ -288,10 +289,22 @@ typedef struct trimflags { uint64_t rate; } trimflags_t; +typedef struct trim_cbdata { + trimflags_t trim_flags; + pool_trim_func_t cmd_type; +} trim_cbdata_t; + +typedef struct initialize_cbdata { + boolean_t wait; + pool_initialize_func_t cmd_type; +} initialize_cbdata_t; /* * Functions to manipulate pool and vdev state */ _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); +_LIBZFS_H int zpool_scan_range(zpool_handle_t *, pool_scan_func_t, + pool_scrub_cmd_t, time_t, time_t); +_LIBZFS_H int zpool_initialize_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, @@ -304,7 +317,9 @@ _LIBZFS_H int zpool_reguid(zpool_handle_t *); _LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *); _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); +_LIBZFS_H void zpool_collect_leaves(zpool_handle_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_trim_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t, uint64_t); @@ -464,6 +479,8 @@ _LIBZFS_H zpool_status_t zpool_import_status(nvlist_t *, const char **, _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); _LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); _LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); +_LIBZFS_H void zpool_refresh_stats_from_handle(zpool_handle_t *, + zpool_handle_t *); _LIBZFS_H int zpool_get_errlog(zpool_handle_t *, nvlist_t **); _LIBZFS_H void zpool_add_propname(zpool_handle_t *, const char *); @@ -591,6 +608,7 @@ _LIBZFS_H int zfs_crypto_attempt_load_keys(libzfs_handle_t *, const char *); _LIBZFS_H int zfs_crypto_load_key(zfs_handle_t *, boolean_t, const char *); _LIBZFS_H int zfs_crypto_unload_key(zfs_handle_t *); _LIBZFS_H int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, boolean_t); +_LIBZFS_H boolean_t zfs_is_encrypted(zfs_handle_t *); typedef struct zprop_list { int pl_prop; @@ -853,7 +871,7 @@ _LIBZFS_H uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, - uid_t rid, uint64_t space); + uid_t rid, uint64_t space, uint64_t default_quota); _LIBZFS_H int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, zfs_userspace_cb_t, void *); diff --git a/sys/contrib/openzfs/include/os/freebsd/Makefile.am b/sys/contrib/openzfs/include/os/freebsd/Makefile.am index d975c4fe69fa..d6b6923d033f 100644 --- a/sys/contrib/openzfs/include/os/freebsd/Makefile.am +++ b/sys/contrib/openzfs/include/os/freebsd/Makefile.am @@ -33,7 +33,7 @@ noinst_HEADERS = \ %D%/spl/sys/list_impl.h \ %D%/spl/sys/lock.h \ %D%/spl/sys/misc.h \ - %D%/spl/sys/mod_os.h \ + %D%/spl/sys/mod.h \ %D%/spl/sys/mode.h \ %D%/spl/sys/mount.h \ %D%/spl/sys/mutex.h \ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h index 112f7bc32849..51238dd1c8fd 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h @@ -43,10 +43,7 @@ extern "C" { #endif #define EXPORT_SYMBOL(x) -#define module_param(a, b, c) -#define module_param_call(a, b, c, d, e) -#define module_param_named(a, b, c, d) -#define MODULE_PARM_DESC(a, b) + #define asm __asm #ifdef ZFS_DEBUG #undef NDEBUG diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h index c1a7cfdeca51..32bc02f3dc86 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h @@ -69,6 +69,10 @@ #define __maybe_unused __attribute__((unused)) #endif +#ifndef __must_check +#define __must_check __attribute__((__warn_unused_result__)) +#endif + /* * Without this, we see warnings from objtool during normal Linux builds when * the kernel is built with CONFIG_STACK_VALIDATION=y: @@ -112,14 +116,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) } while (0) #define VERIFY3B(LEFT, OP, RIGHT) do { \ - const boolean_t _verify3_left = (boolean_t)(LEFT); \ - const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + const boolean_t _verify3_left = (boolean_t)!!(LEFT); \ + const boolean_t _verify3_right = (boolean_t)!!(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3B(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%d " #OP " %d)\n", \ - (boolean_t)_verify3_left, \ - (boolean_t)_verify3_right); \ + _verify3_left, _verify3_right); \ } while (0) #define VERIFY3S(LEFT, OP, RIGHT) do { \ @@ -127,7 +130,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const int64_t _verify3_right = (int64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3S(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%lld " #OP " %lld)\n", \ (long long)_verify3_left, \ (long long)_verify3_right); \ @@ -138,7 +141,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uint64_t _verify3_right = (uint64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3U(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%llu " #OP " %llu)\n", \ (unsigned long long)_verify3_left, \ (unsigned long long)_verify3_right); \ @@ -149,8 +152,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%px " #OP " %px)\n", \ + "VERIFY3P(" #LEFT ", " #OP ", " #RIGHT ") " \ + "failed (%p " #OP " %p)\n", \ (void *)_verify3_left, \ (void *)_verify3_right); \ } while (0) @@ -159,8 +162,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const int64_t _verify0_right = (int64_t)(RIGHT); \ if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(" #RIGHT ") " \ - "failed (0 == %lld)\n", \ + "VERIFY0(" #RIGHT ") failed (%lld)\n", \ (long long)_verify0_right); \ } while (0) @@ -168,8 +170,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uintptr_t _verify0_right = (uintptr_t)(RIGHT); \ if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0P(" #RIGHT ") " \ - "failed (NULL == %px)\n", \ + "VERIFY0P(" #RIGHT ") failed (%p)\n", \ (void *)_verify0_right); \ } while (0) @@ -182,14 +183,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) */ #define VERIFY3BF(LEFT, OP, RIGHT, STR, ...) do { \ - const boolean_t _verify3_left = (boolean_t)(LEFT); \ - const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + const boolean_t _verify3_left = (boolean_t)!!(LEFT); \ + const boolean_t _verify3_right = (boolean_t)!!(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3B(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%d " #OP " %d) " STR "\n", \ - (boolean_t)(_verify3_left), \ - (boolean_t)(_verify3_right), \ + _verify3_left, _verify3_right, \ __VA_ARGS__); \ } while (0) @@ -198,10 +198,9 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const int64_t _verify3_right = (int64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3S(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%lld " #OP " %lld) " STR "\n", \ - (long long)(_verify3_left), \ - (long long)(_verify3_right), \ + (long long)_verify3_left, (long long)_verify3_right,\ __VA_ARGS__); \ } while (0) @@ -210,10 +209,10 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uint64_t _verify3_right = (uint64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3U(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%llu " #OP " %llu) " STR "\n", \ - (unsigned long long)(_verify3_left), \ - (unsigned long long)(_verify3_right), \ + (unsigned long long)_verify3_left, \ + (unsigned long long)_verify3_right, \ __VA_ARGS__); \ } while (0) @@ -222,32 +221,27 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%px " #OP " %px) " STR "\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right), \ + "VERIFY3P(" #LEFT ", " #OP ", " #RIGHT ") " \ + "failed (%p " #OP " %p) " STR "\n", \ + (void *)_verify3_left, (void *)_verify3_right, \ __VA_ARGS__); \ } while (0) #define VERIFY0PF(RIGHT, STR, ...) do { \ - const uintptr_t _verify3_left = (uintptr_t)(0); \ const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + if (unlikely(!(0 == _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ - "failed (0 == %px) " STR "\n", \ - (long long) (_verify3_right), \ + "VERIFY0P(" #RIGHT ") failed (%p) " STR "\n", \ + (void *)_verify3_right, \ __VA_ARGS__); \ } while (0) #define VERIFY0F(RIGHT, STR, ...) do { \ - const int64_t _verify3_left = (int64_t)(0); \ const int64_t _verify3_right = (int64_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + if (unlikely(!(0 == _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ - "failed (0 == %lld) " STR "\n", \ - (long long) (_verify3_right), \ + "VERIFY0(" #RIGHT ") failed (%lld) " STR "\n", \ + (long long)_verify3_right, \ __VA_ARGS__); \ } while (0) @@ -256,10 +250,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_assert("(" #A ") implies (" #B ")", \ __FILE__, __FUNCTION__, __LINE__))) -#define VERIFY_EQUIV(A, B) \ - ((void)(likely(!!(A) == !!(B)) || \ - spl_assert("(" #A ") is equivalent to (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) +#define VERIFY_EQUIV(A, B) VERIFY3B(A, ==, B) /* * Debugging disabled (--disable-debug) diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h index 091ebe772810..acce8734b2c5 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h @@ -56,4 +56,9 @@ struct opensolaris_utsname { #define task_io_account_read(n) #define task_io_account_write(n) +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod.h index 4214189c32df..4214189c32df 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod.h diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h index 639ade831c28..48bc4f3d5b0f 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h @@ -39,7 +39,6 @@ struct znode; int secpolicy_nfs(cred_t *cr); int secpolicy_zfs(cred_t *crd); -int secpolicy_zfs_proc(cred_t *cr, proc_t *proc); int secpolicy_sys_config(cred_t *cr, int checkonly); int secpolicy_zinject(cred_t *cr); int secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp); diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h index a03b815a22a6..1cbd79ec893f 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h @@ -45,7 +45,9 @@ #ifdef _KERNEL #define CPU curcpu #define minclsyspri PRIBIO -#define defclsyspri minclsyspri +#define defclsyspri minclsyspri +/* Write issue taskq priority. */ +#define wtqclsyspri ((PVM + PRIBIO) / 2) #define maxclsyspri PVM #define max_ncpus (mp_maxid + 1) #define boot_max_ncpus (mp_maxid + 1) @@ -75,8 +77,8 @@ do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, /* * Be sure there are no surprises. */ - ASSERT(stk == NULL); - ASSERT(len == 0); + ASSERT0P(stk); + ASSERT0(len); ASSERT(state == TS_RUN); if (pp == &p0) diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/simd_powerpc.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/simd_powerpc.h index 0be9257e40cb..5596f35a66d1 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/simd_powerpc.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/simd_powerpc.h @@ -53,11 +53,11 @@ #define kfpu_allowed() 1 #define kfpu_initialize(tsk) do {} while (0) -#define kfpu_begin() { \ +#define kfpu_begin() { \ if (__predict_false(!is_fpu_kern_thread(0))) \ fpu_kern_enter(PCPU_GET(curthread), NULL, FPU_KERN_NOCTX);\ } -#define kfpu_end() { \ +#define kfpu_end() { \ if (__predict_false(PCPU_GET(curpcb)->pcb_flags & PCB_KERN_FPU_NOSAVE))\ fpu_kern_leave(PCPU_GET(curthread), NULL); \ } diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h index 2f5fe4619ef7..14b42f2e7087 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h @@ -63,6 +63,17 @@ typedef longlong_t hrtime_t; #define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) static __inline hrtime_t +getlrtime(void) +{ + struct timespec ts; + hrtime_t nsec; + + getnanouptime(&ts); + nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; + return (nsec); +} + +static __inline hrtime_t gethrtime(void) { struct timespec ts; diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h index 454078f0fe79..d36bee881d0b 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h @@ -35,6 +35,7 @@ extern const int zfs_vm_pagerret_bad; extern const int zfs_vm_pagerret_error; extern const int zfs_vm_pagerret_ok; +extern const int zfs_vm_pagerret_pend; extern const int zfs_vm_pagerput_sync; extern const int zfs_vm_pagerput_inval; diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h index 0df3378c23e7..b18836aa563e 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h @@ -227,6 +227,7 @@ struct taskq; #define LOOKUP_XATTR 0x02 /* lookup up extended attr dir */ #define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ #define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ +#define LOOKUP_NAMED_ATTR 0x10 /* Lookup a named attribute */ /* * Public vnode manipulation functions. diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index 289b64759382..3ed311d49cc6 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -96,6 +96,12 @@ struct zfsvfs { uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; + uint64_t z_defaultuserquota; + uint64_t z_defaultgroupquota; + uint64_t z_defaultprojectquota; + uint64_t z_defaultuserobjquota; + uint64_t z_defaultgroupobjquota; + uint64_t z_defaultprojectobjquota; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 @@ -226,6 +232,8 @@ extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); extern int zfs_busy(void); +extern int zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t zfs_prop, + uint64_t quota); #ifdef __cplusplus } diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h index b292818750d9..15e3affba0e8 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -29,6 +29,7 @@ #ifndef _FREEBSD_ZFS_SYS_ZNODE_IMPL_H #define _FREEBSD_ZFS_SYS_ZNODE_IMPL_H +#ifdef _KERNEL #include <sys/list.h> #include <sys/dmu.h> #include <sys/sa.h> @@ -42,6 +43,7 @@ #include <sys/zfs_project.h> #include <vm/vm_object.h> #include <sys/uio.h> +#endif #ifdef __cplusplus extern "C" { @@ -54,7 +56,7 @@ extern "C" { */ #define ZNODE_OS_FIELDS \ struct zfsvfs *z_zfsvfs; \ - vnode_t *z_vnode; \ + struct vnode *z_vnode; \ char *z_cached_symlink; \ uint64_t z_uid; \ uint64_t z_gid; \ @@ -62,6 +64,8 @@ extern "C" { uint64_t z_atime[2]; \ uint64_t z_links; +#ifdef _KERNEL + #define ZFS_LINK_MAX UINT64_MAX /* @@ -183,6 +187,9 @@ extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, char *buf, uint64_t buflen); extern int zfs_rlimit_fsize(off_t fsize); + +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/os/linux/Makefile.am b/sys/contrib/openzfs/include/os/linux/Makefile.am index b7bdd892ec1d..e156ca183dbd 100644 --- a/sys/contrib/openzfs/include/os/linux/Makefile.am +++ b/sys/contrib/openzfs/include/os/linux/Makefile.am @@ -8,6 +8,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/mm_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ + %D%/kernel/linux/pagemap_compat.h \ %D%/kernel/linux/simd.h \ %D%/kernel/linux/simd_aarch64.h \ %D%/kernel/linux/simd_arm.h \ @@ -74,7 +75,7 @@ kernel_spl_sys_HEADERS = \ %D%/spl/sys/kstat.h \ %D%/spl/sys/list.h \ %D%/spl/sys/misc.h \ - %D%/spl/sys/mod_os.h \ + %D%/spl/sys/mod.h \ %D%/spl/sys/mutex.h \ %D%/spl/sys/param.h \ %D%/spl/sys/proc.h \ diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h index 076dab8ba6dc..214f3ea0e787 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h @@ -542,24 +542,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id) } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ -/* - * All the io_*() helper functions below can operate on a bio, or a rq, but - * not both. The older submit_bio() codepath will pass a bio, and the - * newer blk-mq codepath will pass a rq. - */ -static inline int -io_data_dir(struct bio *bio, struct request *rq) -{ - if (rq != NULL) { - if (op_is_write(req_op(rq))) { - return (WRITE); - } else { - return (READ); - } - } - return (bio_data_dir(bio)); -} - static inline int io_is_flush(struct bio *bio, struct request *rq) { diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h index 16e8a319a5f8..152e5a606f0e 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h @@ -61,32 +61,6 @@ #endif /* - * 2.6.30 API change, - * The const keyword was added to the 'struct dentry_operations' in - * the dentry structure. To handle this we define an appropriate - * dentry_operations_t typedef which can be used. - */ -typedef const struct dentry_operations dentry_operations_t; - -/* - * 2.6.38 API addition, - * Added d_clear_d_op() helper function which clears some flags and the - * registered dentry->d_op table. This is required because d_set_d_op() - * issues a warning when the dentry operations table is already set. - * For the .zfs control directory to work properly we must be able to - * override the default operations table and register custom .d_automount - * and .d_revalidate callbacks. - */ -static inline void -d_clear_d_op(struct dentry *dentry) -{ - dentry->d_op = NULL; - dentry->d_flags &= ~( - DCACHE_OP_HASH | DCACHE_OP_COMPARE | - DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); -} - -/* * Walk and invalidate all dentry aliases of an inode * unless it's a mountpoint */ diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h index 110cdfa259be..e49ada399694 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h @@ -31,15 +31,6 @@ #include <linux/module.h> #include <linux/moduleparam.h> -/* - * Despite constifying struct kernel_param_ops, some older kernels define a - * `__check_old_set_param()` function in their headers that checks for a - * non-constified `->set()`. This has long been fixed in Linux mainline, but - * since we support older kernels, we workaround it by using a preprocessor - * definition to disable it. - */ -#define __check_old_set_param(_) (0) - typedef const struct kernel_param zfs_kernel_param_t; #define ZMOD_RW 0644 @@ -72,6 +63,7 @@ enum scope_prefix_types { zfs_vdev_disk, zfs_vdev_file, zfs_vdev_mirror, + zfs_vol, zfs_vnops, zfs_zevent, zfs_zio, @@ -79,48 +71,23 @@ enum scope_prefix_types { }; /* - * While we define our own s64/u64 types, there is no reason to reimplement the - * existing Linux kernel types, so we use the preprocessor to remap our - * "custom" implementations to the kernel ones. This is done because the CPP - * does not allow us to write conditional definitions. The fourth definition - * exists because the CPP will not allow us to replace things like INT with int - * before string concatenation. + * Our uint64 params are called U64 in part because we had them before Linux + * provided ULLONG param ops. Now it does, and we use them, but we retain the + * U64 name to keep many existing tunables working without issue. */ +#define spl_param_set_u64 param_set_ullong +#define spl_param_get_u64 param_get_ullong +#define spl_param_ops_U64 param_ops_ullong -#define spl_param_set_int param_set_int -#define spl_param_get_int param_get_int -#define spl_param_ops_int param_ops_int -#define spl_param_ops_INT param_ops_int - -#define spl_param_set_long param_set_long -#define spl_param_get_long param_get_long -#define spl_param_ops_long param_ops_long -#define spl_param_ops_LONG param_ops_long - -#define spl_param_set_uint param_set_uint -#define spl_param_get_uint param_get_uint -#define spl_param_ops_uint param_ops_uint -#define spl_param_ops_UINT param_ops_uint - -#define spl_param_set_ulong param_set_ulong -#define spl_param_get_ulong param_get_ulong -#define spl_param_ops_ulong param_ops_ulong -#define spl_param_ops_ULONG param_ops_ulong - -#define spl_param_set_charp param_set_charp -#define spl_param_get_charp param_get_charp -#define spl_param_ops_charp param_ops_charp -#define spl_param_ops_STRING param_ops_charp - -int spl_param_set_s64(const char *val, zfs_kernel_param_t *kp); -extern int spl_param_get_s64(char *buffer, zfs_kernel_param_t *kp); -extern const struct kernel_param_ops spl_param_ops_s64; -#define spl_param_ops_S64 spl_param_ops_s64 - -extern int spl_param_set_u64(const char *val, zfs_kernel_param_t *kp); -extern int spl_param_get_u64(char *buffer, zfs_kernel_param_t *kp); -extern const struct kernel_param_ops spl_param_ops_u64; -#define spl_param_ops_U64 spl_param_ops_u64 +/* + * We keep our own names for param ops to make expanding them in + * ZFS_MODULE_PARAM easy. + */ +#define spl_param_ops_INT param_ops_int +#define spl_param_ops_LONG param_ops_long +#define spl_param_ops_UINT param_ops_uint +#define spl_param_ops_ULONG param_ops_ulong +#define spl_param_ops_STRING param_ops_charp /* * Declare a module parameter / sysctl node diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/page_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/page_compat.h index 963b96ba6351..7dcf53bbea47 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/page_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/page_compat.h @@ -4,8 +4,8 @@ /* * Create our own accessor functions to follow the Linux API changes */ -#define nr_file_pages() global_node_page_state(NR_FILE_PAGES) -#define nr_inactive_anon_pages() global_node_page_state(NR_INACTIVE_ANON) +#define nr_file_pages() (global_node_page_state(NR_ACTIVE_FILE) + \ + global_node_page_state(NR_INACTIVE_FILE)) #define nr_inactive_file_pages() global_node_page_state(NR_INACTIVE_FILE) #endif /* _ZFS_PAGE_COMPAT_H */ diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/pagemap_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/pagemap_compat.h new file mode 100644 index 000000000000..a0465ede0105 --- /dev/null +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/pagemap_compat.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + +#ifndef _ZFS_PAGEMAP_COMPAT_H +#define _ZFS_PAGEMAP_COMPAT_H + +#include <linux/pagemap.h> + +#ifndef HAVE_PAGEMAP_READAHEAD_PAGE +#define readahead_page(ractl) (&(__readahead_folio(ractl)->page)) +#endif + +#endif diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h index cd245a5f0135..326f471d7c9b 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h @@ -139,15 +139,6 @@ */ #if defined(HAVE_KERNEL_FPU_INTERNAL) -/* - * For kernels not exporting *kfpu_{begin,end} we have to use inline assembly - * with the XSAVE{,OPT,S} instructions, so we need the toolchain to support at - * least XSAVE. - */ -#if !defined(HAVE_XSAVE) -#error "Toolchain needs to support the XSAVE assembler instruction" -#endif - #ifndef XFEATURE_MASK_XTILE /* * For kernels where this doesn't exist yet, we still don't want to break @@ -335,9 +326,13 @@ kfpu_begin(void) return; } #endif +#if defined(HAVE_XSAVE) if (static_cpu_has(X86_FEATURE_XSAVE)) { kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE); - } else if (static_cpu_has(X86_FEATURE_FXSR)) { + return; + } +#endif + if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_save_fxsr(state); } else { kfpu_save_fsave(state); @@ -390,9 +385,13 @@ kfpu_end(void) goto out; } #endif +#if defined(HAVE_XSAVE) if (static_cpu_has(X86_FEATURE_XSAVE)) { kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE); - } else if (static_cpu_has(X86_FEATURE_FXSR)) { + goto out; + } +#endif + if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_restore_fxsr(state); } else { kfpu_restore_fsave(state); @@ -599,6 +598,32 @@ zfs_movbe_available(void) } /* + * Check if VAES instruction set is available + */ +static inline boolean_t +zfs_vaes_available(void) +{ +#if defined(X86_FEATURE_VAES) + return (!!boot_cpu_has(X86_FEATURE_VAES)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if VPCLMULQDQ instruction set is available + */ +static inline boolean_t +zfs_vpclmulqdq_available(void) +{ +#if defined(X86_FEATURE_VPCLMULQDQ) + return (!!boot_cpu_has(X86_FEATURE_VPCLMULQDQ)); +#else + return (B_FALSE); +#endif +} + +/* * Check if SHA_NI instruction set is available */ static inline boolean_t diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h b/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h index b2a39d7d6cbf..f4bcd58bd281 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h @@ -71,6 +71,22 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) return ((void *)atomic_cas_64((volatile uint64_t *)target, (uint64_t)cmp, (uint64_t)newval)); } +static __inline__ void * +atomic_swap_ptr(volatile void *target, void *newval) +{ + return ((void *)atomic_swap_64((volatile uint64_t *)target, + (uint64_t)newval)); +} +static __inline__ void * +atomic_load_ptr(volatile void *target) +{ + return ((void *)atomic_load_64((volatile uint64_t *)target)); +} +static __inline__ void +atomic_store_ptr(volatile void *target, void *newval) +{ + atomic_store_64((volatile uint64_t *)target, (uint64_t)newval); +} #else /* _LP64 */ static __inline__ void * atomic_cas_ptr(volatile void *target, void *cmp, void *newval) @@ -78,6 +94,22 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) return ((void *)atomic_cas_32((volatile uint32_t *)target, (uint32_t)cmp, (uint32_t)newval)); } +static __inline__ void * +atomic_swap_ptr(volatile void *target, void *newval) +{ + return ((void *)atomic_swap_32((volatile uint32_t *)target, + (uint32_t)newval)); +} +static __inline__ void * +atomic_load_ptr(volatile void *target) +{ + return ((void *)atomic_load_32((volatile uint32_t *)target)); +} +static __inline__ void +atomic_store_ptr(volatile void *target, void *newval) +{ + atomic_store_32((volatile uint32_t *)target, (uint32_t)newval); +} #endif /* _LP64 */ #endif /* _SPL_ATOMIC_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h index 700cc85b60b6..85b96e1e23a7 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h @@ -69,6 +69,10 @@ #define __maybe_unused __attribute__((unused)) #endif +#ifndef __must_check +#define __must_check __attribute__((__warn_unused_result__)) +#endif + /* * Without this, we see warnings from objtool during normal Linux builds when * the kernel is built with CONFIG_STACK_VALIDATION=y: @@ -116,14 +120,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) } while (0) #define VERIFY3B(LEFT, OP, RIGHT) do { \ - const boolean_t _verify3_left = (boolean_t)(LEFT); \ - const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + const boolean_t _verify3_left = (boolean_t)!!(LEFT); \ + const boolean_t _verify3_right = (boolean_t)!!(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3B(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%d " #OP " %d)\n", \ - (boolean_t)_verify3_left, \ - (boolean_t)_verify3_right); \ + _verify3_left, _verify3_right); \ } while (0) #define VERIFY3S(LEFT, OP, RIGHT) do { \ @@ -131,7 +134,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const int64_t _verify3_right = (int64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3S(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%lld " #OP " %lld)\n", \ (long long)_verify3_left, \ (long long)_verify3_right); \ @@ -142,7 +145,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uint64_t _verify3_right = (uint64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3U(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%llu " #OP " %llu)\n", \ (unsigned long long)_verify3_left, \ (unsigned long long)_verify3_right); \ @@ -153,7 +156,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3P(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%px " #OP " %px)\n", \ (void *)_verify3_left, \ (void *)_verify3_right); \ @@ -163,8 +166,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const int64_t _verify0_right = (int64_t)(RIGHT); \ if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(" #RIGHT ") " \ - "failed (0 == %lld)\n", \ + "VERIFY0(" #RIGHT ") failed (%lld)\n", \ (long long)_verify0_right); \ } while (0) @@ -172,8 +174,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uintptr_t _verify0_right = (uintptr_t)(RIGHT); \ if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0P(" #RIGHT ") " \ - "failed (NULL == %px)\n", \ + "VERIFY0P(" #RIGHT ") failed (%px)\n", \ (void *)_verify0_right); \ } while (0) @@ -186,14 +187,13 @@ spl_assert(const char *buf, const char *file, const char *func, int line) */ #define VERIFY3BF(LEFT, OP, RIGHT, STR, ...) do { \ - const boolean_t _verify3_left = (boolean_t)(LEFT); \ - const boolean_t _verify3_right = (boolean_t)(RIGHT); \ + const boolean_t _verify3_left = (boolean_t)!!(LEFT); \ + const boolean_t _verify3_right = (boolean_t)!!(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3B(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%d " #OP " %d) " STR "\n", \ - (boolean_t)(_verify3_left), \ - (boolean_t)(_verify3_right), \ + _verify3_left, _verify3_right, \ __VA_ARGS__); \ } while (0) @@ -202,10 +202,9 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const int64_t _verify3_right = (int64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3S(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%lld " #OP " %lld) " STR "\n", \ - (long long)(_verify3_left), \ - (long long)(_verify3_right), \ + (long long)_verify3_left, (long long)_verify3_right,\ __VA_ARGS__); \ } while (0) @@ -214,10 +213,10 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uint64_t _verify3_right = (uint64_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3U(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%llu " #OP " %llu) " STR "\n", \ - (unsigned long long)(_verify3_left), \ - (unsigned long long)(_verify3_right), \ + (unsigned long long)_verify3_left, \ + (unsigned long long)_verify3_right, \ __VA_ARGS__); \ } while (0) @@ -226,32 +225,27 @@ spl_assert(const char *buf, const char *file, const char *func, int line) const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "VERIFY3P(" #LEFT ", " #OP ", " #RIGHT ") " \ "failed (%px " #OP " %px) " STR "\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right), \ + (void *)_verify3_left, (void *)_verify3_right, \ __VA_ARGS__); \ } while (0) #define VERIFY0PF(RIGHT, STR, ...) do { \ - const uintptr_t _verify3_left = (uintptr_t)(0); \ const uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + if (unlikely(!(0 == _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ - "failed (0 == %px) " STR "\n", \ - (long long) (_verify3_right), \ + "VERIFY0P(" #RIGHT ") failed (%px) " STR "\n", \ + (void *)_verify3_right, \ __VA_ARGS__); \ } while (0) #define VERIFY0F(RIGHT, STR, ...) do { \ - const int64_t _verify3_left = (int64_t)(0); \ const int64_t _verify3_right = (int64_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + if (unlikely(!(0 == _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ - "failed (0 == %lld) " STR "\n", \ - (long long) (_verify3_right), \ + "VERIFY0(" #RIGHT ") failed (%lld) " STR "\n", \ + (long long)_verify3_right, \ __VA_ARGS__); \ } while (0) @@ -260,10 +254,7 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_assert("(" #A ") implies (" #B ")", \ __FILE__, __FUNCTION__, __LINE__))) -#define VERIFY_EQUIV(A, B) \ - ((void)(likely(!!(A) == !!(B)) || \ - spl_assert("(" #A ") is equivalent to (" #B ")", \ - __FILE__, __FUNCTION__, __LINE__))) +#define VERIFY_EQUIV(A, B) VERIFY3B(A, ==, B) /* * Debugging disabled (--disable-debug) diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h index 995236117dd4..fe34de9c179e 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h @@ -61,7 +61,7 @@ void *spl_kvmalloc(size_t size, gfp_t flags); /* * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion * function is context aware which means that KM_SLEEP allocations can be - * safely used in syncing contexts which have set PF_FSTRANS. + * safely used in syncing contexts which have set SPL_FSTRANS. */ static inline gfp_t kmem_flags_convert(int flags) @@ -91,25 +91,11 @@ typedef struct { } fstrans_cookie_t; /* - * Introduced in Linux 3.9, however this cannot be solely relied on before - * Linux 3.18 as it doesn't turn off __GFP_FS as it should. + * SPL_FSTRANS is the set of flags that indicate that the task is in a + * filesystem or IO codepath, and so any allocation must not call back into + * those codepaths (eg to swap). */ -#ifdef PF_MEMALLOC_NOIO -#define __SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO) -#else -#define __SPL_PF_MEMALLOC_NOIO (0) -#endif - -/* - * PF_FSTRANS is removed from Linux 4.12 - */ -#ifdef PF_FSTRANS -#define __SPL_PF_FSTRANS (PF_FSTRANS) -#else -#define __SPL_PF_FSTRANS (0) -#endif - -#define SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO) +#define SPL_FSTRANS (PF_MEMALLOC_NOIO) static inline fstrans_cookie_t spl_fstrans_mark(void) @@ -141,43 +127,8 @@ spl_fstrans_check(void) return (current->flags & SPL_FSTRANS); } -/* - * specifically used to check PF_FSTRANS flag, cannot be relied on for - * checking spl_fstrans_mark(). - */ -static inline int -__spl_pf_fstrans_check(void) -{ - return (current->flags & __SPL_PF_FSTRANS); -} - -/* - * Kernel compatibility for GFP flags - */ -/* < 4.13 */ -#ifndef __GFP_RETRY_MAYFAIL -#define __GFP_RETRY_MAYFAIL __GFP_REPEAT -#endif -/* < 4.4 */ -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - -#ifdef HAVE_ATOMIC64_T -#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) -#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) -#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) -#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) extern atomic64_t kmem_alloc_used; -extern unsigned long long kmem_alloc_max; -#else /* HAVE_ATOMIC64_T */ -#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) -#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) -#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) -#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) -extern atomic_t kmem_alloc_used; -extern unsigned long long kmem_alloc_max; -#endif /* HAVE_ATOMIC64_T */ +extern uint64_t kmem_alloc_max; extern unsigned int spl_kmem_alloc_warn; extern unsigned int spl_kmem_alloc_max; diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h b/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h index 0b44786f8a6e..fbaaf229bd1a 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h @@ -24,7 +24,13 @@ #define _OS_LINUX_SPL_MISC_H #include <linux/kobject.h> +#include <linux/swap.h> extern void spl_signal_kobj_evt(struct block_device *bdev); +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/mod_os.h b/sys/contrib/openzfs/include/os/linux/spl/sys/mod.h index eaeb9255039e..eaeb9255039e 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/mod_os.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/mod.h diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h index f000f53ab9b6..4eca2414fc5b 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h @@ -111,7 +111,7 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ #undef mutex_destroy #define mutex_destroy(mp) \ { \ - VERIFY3P(mutex_owner(mp), ==, NULL); \ + VERIFY0P(mutex_owner(mp)); \ } #define mutex_tryenter(mp) \ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h b/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h index 563e0a19663d..c883836c2f83 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h @@ -130,7 +130,7 @@ RW_READ_HELD(krwlock_t *rwp) /* * The Linux rwsem implementation does not require a matching destroy. */ -#define rw_destroy(rwp) ((void) 0) +#define rw_destroy(rwp) ASSERT(!(RW_LOCK_HELD(rwp))) /* * Upgrading a rwsem from a reader to a writer is not supported by the diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h b/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h index 087389b57b34..ad2815e46394 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h @@ -25,6 +25,6 @@ #ifndef _SPL_STAT_H #define _SPL_STAT_H -#include <linux/stat.h> +#include <sys/stat.h> #endif /* SPL_STAT_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h index e932ea72f1be..db48222b712a 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h @@ -92,8 +92,10 @@ * Treat shim tasks as SCHED_NORMAL tasks */ #define minclsyspri (MAX_PRIO-1) -#define maxclsyspri (MAX_RT_PRIO) #define defclsyspri (DEFAULT_PRIO) +/* Write issue taskq priority. */ +#define wtqclsyspri (MAX_RT_PRIO + 1) +#define maxclsyspri (MAX_RT_PRIO) #ifndef NICE_TO_PRIO #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/time.h b/sys/contrib/openzfs/include/os/linux/spl/sys/time.h index 33b273b53996..4edc42a8aef9 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/time.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/time.h @@ -80,6 +80,14 @@ gethrestime_sec(void) } static inline hrtime_t +getlrtime(void) +{ + inode_timespec_t ts; + ktime_get_coarse_ts64(&ts); + return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec); +} + +static inline hrtime_t gethrtime(void) { struct timespec64 ts; diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h index f66da5d5af57..26c2c387caa3 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h @@ -174,7 +174,7 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) static inline void zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, - ssize_t resid, size_t skip) + ssize_t resid) { uio->uio_iter = iter; uio->uio_iovcnt = iter->nr_segs; @@ -184,7 +184,7 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = resid; - uio->uio_skip = skip; + uio->uio_skip = 0; uio->uio_soffset = uio->uio_loffset; memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h index 77d0cdef5d2f..8fa6ab01d1ad 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h @@ -52,7 +52,6 @@ int secpolicy_vnode_setids_setgids(const cred_t *, gid_t, zidmap_t *, struct user_namespace *); int secpolicy_zinject(const cred_t *); int secpolicy_zfs(const cred_t *); -int secpolicy_zfs_proc(const cred_t *, proc_t *); void secpolicy_setid_clear(vattr_t *, cred_t *); int secpolicy_setid_setsticky_clear(struct inode *, vattr_t *, const vattr_t *, cred_t *, zidmap_t *, struct user_namespace *); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h index 8923657daf02..d88b4937ef08 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h @@ -59,8 +59,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint64_t, z_size) __field(uint64_t, z_pflags) __field(uint32_t, z_sync_cnt) - __field(uint32_t, z_sync_writes_cnt) - __field(uint32_t, z_async_writes_cnt) __field(mode_t, z_mode) __field(boolean_t, z_is_sa) __field(boolean_t, z_is_ctldir) @@ -92,8 +90,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_size = zn->z_size; __entry->z_pflags = zn->z_pflags; __entry->z_sync_cnt = zn->z_sync_cnt; - __entry->z_sync_writes_cnt = zn->z_sync_writes_cnt; - __entry->z_async_writes_cnt = zn->z_async_writes_cnt; __entry->z_mode = zn->z_mode; __entry->z_is_sa = zn->z_is_sa; __entry->z_is_ctldir = zn->z_is_ctldir; @@ -117,7 +113,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, TP_printk("zn { id %llu unlinked %u atime_dirty %u " "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " - "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u " + "sync_cnt %u " "mode 0x%x is_sa %d is_ctldir %d " "inode { uid %u gid %u ino %lu nlink %u size %lli " "blkbits %u bytes %u mode 0x%x generation %x } } " @@ -126,7 +122,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_zn_prefetch, __entry->z_blksz, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, - __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt, __entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir, __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink, __entry->i_size, __entry->i_blkbits, diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h index 85cf8cc20b09..e1b6d61099b9 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h @@ -45,7 +45,7 @@ __field(zio_flag_t, zio_orig_flags) \ __field(enum zio_stage, zio_orig_stage) \ __field(enum zio_stage, zio_orig_pipeline) \ - __field(uint8_t, zio_reexecute) \ + __field(uint8_t, zio_post) \ __field(uint64_t, zio_txg) \ __field(int, zio_error) \ __field(uint64_t, zio_ena) \ @@ -74,7 +74,7 @@ __entry->zio_orig_flags = zio->io_orig_flags; \ __entry->zio_orig_stage = zio->io_orig_stage; \ __entry->zio_orig_pipeline = zio->io_orig_pipeline; \ - __entry->zio_reexecute = zio->io_reexecute; \ + __entry->zio_post = zio->io_post; \ __entry->zio_txg = zio->io_txg; \ __entry->zio_error = zio->io_error; \ __entry->zio_ena = zio->io_ena; \ @@ -92,7 +92,7 @@ "zio { type %u prio %u size %llu orig_size %llu " \ "offset %llu timestamp %llu delta %llu delay %llu " \ "flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx " \ - "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ + "orig_stage 0x%x orig_pipeline 0x%x post %u " \ "txg %llu error %d ena %llu prop { checksum %u compress %u " \ "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" @@ -102,7 +102,7 @@ __entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \ __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \ __entry->zio_orig_flags, __entry->zio_orig_stage, \ - __entry->zio_orig_pipeline, __entry->zio_reexecute, \ + __entry->zio_orig_pipeline, __entry->zio_post, \ __entry->zio_txg, __entry->zio_error, __entry->zio_ena, \ __entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \ __entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h index 955462c85d10..e34ea46b3fe8 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h @@ -139,18 +139,18 @@ #define ZCW_TP_STRUCT_ENTRY \ __field(lwb_t *, zcw_lwb) \ __field(boolean_t, zcw_done) \ - __field(int, zcw_zio_error) \ + __field(int, zcw_error) \ #define ZCW_TP_FAST_ASSIGN \ __entry->zcw_lwb = zcw->zcw_lwb; \ __entry->zcw_done = zcw->zcw_done; \ - __entry->zcw_zio_error = zcw->zcw_zio_error; + __entry->zcw_error = zcw->zcw_error; #define ZCW_TP_PRINTK_FMT \ "zcw { lwb %p done %u error %u }" #define ZCW_TP_PRINTK_ARGS \ - __entry->zcw_lwb, __entry->zcw_done, __entry->zcw_zio_error + __entry->zcw_lwb, __entry->zcw_done, __entry->zcw_error /* * Generic support for two argument tracepoints of the form: diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h index 4a73712e959d..ab46d5f8ca08 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -131,6 +131,12 @@ struct zfsvfs { uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; + uint64_t z_defaultuserquota; + uint64_t z_defaultgroupquota; + uint64_t z_defaultprojectquota; + uint64_t z_defaultuserobjquota; + uint64_t z_defaultgroupobjquota; + uint64_t z_defaultprojectobjquota; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ uint64_t z_hold_size; /* znode hold array size */ @@ -250,6 +256,8 @@ extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects); extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); +extern int zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t zfs_prop, + uint64_t quota); #ifdef __cplusplus } diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h index b38847b20462..6a77e40abe10 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -157,6 +157,7 @@ struct znode; extern int zfs_sync(struct super_block *, int, cred_t *); extern int zfs_inode_alloc(struct super_block *, struct inode **ip); +extern void zfs_inode_free(struct inode *); extern void zfs_inode_destroy(struct inode *); extern void zfs_mark_inode_dirty(struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h index 39f1310aadf2..8994aab889fe 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h @@ -55,6 +55,7 @@ extern const struct file_operations zpl_dir_file_operations; extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; +extern const struct dentry_operations zpl_dentry_operations; extern const struct export_operations zpl_export_operations; extern struct file_system_type zpl_fs_type; @@ -123,41 +124,6 @@ extern int zpl_clone_file_range(struct file *src_file, loff_t src_off, extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len); -/* compat for FICLONE/FICLONERANGE/FIDEDUPERANGE ioctls */ -typedef struct { - int64_t fcr_src_fd; - uint64_t fcr_src_offset; - uint64_t fcr_src_length; - uint64_t fcr_dest_offset; -} zfs_ioc_compat_file_clone_range_t; - -typedef struct { - int64_t fdri_dest_fd; - uint64_t fdri_dest_offset; - uint64_t fdri_bytes_deduped; - int32_t fdri_status; - uint32_t fdri_reserved; -} zfs_ioc_compat_dedupe_range_info_t; - -typedef struct { - uint64_t fdr_src_offset; - uint64_t fdr_src_length; - uint16_t fdr_dest_count; - uint16_t fdr_reserved1; - uint32_t fdr_reserved2; - zfs_ioc_compat_dedupe_range_info_t fdr_info[]; -} zfs_ioc_compat_dedupe_range_t; - -#define ZFS_IOC_COMPAT_FICLONE _IOW(0x94, 9, int) -#define ZFS_IOC_COMPAT_FICLONERANGE \ - _IOW(0x94, 13, zfs_ioc_compat_file_clone_range_t) -#define ZFS_IOC_COMPAT_FIDEDUPERANGE \ - _IOWR(0x94, 54, zfs_ioc_compat_dedupe_range_t) - -extern long zpl_ioctl_ficlone(struct file *filp, void *arg); -extern long zpl_ioctl_ficlonerange(struct file *filp, void *arg); -extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); - #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index 1b30389107c5..b55d5da3378c 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -954,7 +954,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - wmsum_t arcstat_dnode_size; + aggsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; diff --git a/sys/contrib/openzfs/include/sys/dbuf.h b/sys/contrib/openzfs/include/sys/dbuf.h index 285e02484c57..baf3b1508335 100644 --- a/sys/contrib/openzfs/include/sys/dbuf.h +++ b/sys/contrib/openzfs/include/sys/dbuf.h @@ -46,20 +46,6 @@ extern "C" { #define IN_DMU_SYNC 2 /* - * define flags for dbuf_read - */ - -#define DB_RF_MUST_SUCCEED (1 << 0) -#define DB_RF_CANFAIL (1 << 1) -#define DB_RF_HAVESTRUCT (1 << 2) -#define DB_RF_NOPREFETCH (1 << 3) -#define DB_RF_NEVERWAIT (1 << 4) -#define DB_RF_CACHED (1 << 5) -#define DB_RF_NO_DECRYPT (1 << 6) -#define DB_RF_PARTIAL_FIRST (1 << 7) -#define DB_RF_PARTIAL_MORE (1 << 8) - -/* * The simplified state transition diagram for dbufs looks like: * * +-------> READ ------+ @@ -178,6 +164,7 @@ typedef struct dbuf_dirty_record { boolean_t dr_nopwrite; boolean_t dr_brtwrite; boolean_t dr_diowrite; + boolean_t dr_rewrite; boolean_t dr_has_raw_params; /* Override and raw params are mutually exclusive. */ @@ -389,19 +376,21 @@ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid, uint64_t *hash_out); -int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, dmu_flags_t flags); void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); +void dmu_buf_will_fill_flags(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail, + dmu_flags_t flags); boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); -void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); +void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx, + dmu_flags_t flags); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); int dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp); int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa); -arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); @@ -447,6 +436,7 @@ int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, void dbuf_init(void); void dbuf_fini(void); +void dbuf_cache_reduce_target_size(void); boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); @@ -476,10 +466,10 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) #define DBUF_GET_BUFC_TYPE(_db) \ (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) -#define DBUF_IS_CACHEABLE(_db) \ +#define DBUF_IS_CACHEABLE(_db) (!(_db)->db_pending_evict && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))) boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp); diff --git a/sys/contrib/openzfs/include/sys/ddt.h b/sys/contrib/openzfs/include/sys/ddt.h index 8f2cc9c5a99b..f1687d471a0a 100644 --- a/sys/contrib/openzfs/include/sys/ddt.h +++ b/sys/contrib/openzfs/include/sys/ddt.h @@ -339,6 +339,8 @@ extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp); +extern void ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig, + ddt_phys_variant_t v); extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, ddt_phys_variant_t v); extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); @@ -350,6 +352,8 @@ extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp); extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern int ddt_phys_is_gang(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, boolean_t encrypted); diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index da1fdfd23962..aa5035862def 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -144,9 +144,9 @@ typedef enum dmu_object_byteswap { #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) -#define DMU_OT_IS_CRITICAL(ot) \ +#define DMU_OT_IS_CRITICAL(ot, level) \ (DMU_OT_IS_METADATA(ot) && \ - (ot) != DMU_OT_DNODE && \ + ((ot) != DMU_OT_DNODE || (level) > 0) && \ (ot) != DMU_OT_DIRECTORY_CONTENTS && \ (ot) != DMU_OT_SA) @@ -281,9 +281,30 @@ typedef enum dmu_object_type { * the transaction is full. See the comment above dmu_tx_assign() for more * details on the meaning of these flags. */ -#define DMU_TX_NOWAIT (0ULL) -#define DMU_TX_WAIT (1ULL<<0) -#define DMU_TX_NOTHROTTLE (1ULL<<1) +typedef enum { + /* + * If the tx cannot be assigned to a transaction for any reason, do + * not block but return immediately. + */ + DMU_TX_NOWAIT = 0, + + /* + * Assign the tx to the open transaction. If the open transaction is + * full, or the write throttle is active, block until the next + * transaction and try again. If the pool suspends while waiting + * and failmode=continue, return an error. + */ + DMU_TX_WAIT = (1 << 0), + + /* If the write throttle would prevent the assignment, ignore it. */ + DMU_TX_NOTHROTTLE = (1 << 1), + + /* + * With DMU_TX_WAIT, always block if the pool suspends during + * assignment, regardless of the value of the failmode= property. + */ + DMU_TX_SUSPEND = (1 << 2), +} dmu_tx_flag_t; void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); @@ -339,7 +360,6 @@ void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, void *arg); -int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); @@ -394,6 +414,9 @@ typedef struct dmu_buf { #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" +#define DMU_POOL_TXG_LOG_TIME_MINUTES "com.klarasystems:txg_log_time:minutes" +#define DMU_POOL_TXG_LOG_TIME_DAYS "com.klarasystems:txg_log_time:days" +#define DMU_POOL_TXG_LOG_TIME_MONTHS "com.klarasystems:txg_log_time:months" /* * Allocate an object from this objset. The range of object numbers @@ -533,6 +556,26 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* + * DB_RF_* are to be used for dbuf_read() or in limited other cases. + */ +typedef enum dmu_flags { + DB_RF_MUST_SUCCEED = 0, /* Suspend on I/O errors. */ + DB_RF_CANFAIL = 1 << 0, /* Return on I/O errors. */ + DB_RF_HAVESTRUCT = 1 << 1, /* dn_struct_rwlock is locked. */ + DB_RF_NEVERWAIT = 1 << 2, + DMU_READ_PREFETCH = 0, /* Try speculative prefetch. */ + DMU_READ_NO_PREFETCH = 1 << 3, /* Don't prefetch speculatively. */ + DB_RF_NOPREFETCH = DMU_READ_NO_PREFETCH, + DMU_READ_NO_DECRYPT = 1 << 4, /* Don't decrypt. */ + DB_RF_NO_DECRYPT = DMU_READ_NO_DECRYPT, + DMU_DIRECTIO = 1 << 5, /* Bypass ARC. */ + DMU_UNCACHEDIO = 1 << 6, /* Reduce caching. */ + DMU_PARTIAL_FIRST = 1 << 7, /* First partial access. */ + DMU_PARTIAL_MORE = 1 << 8, /* Following partial access. */ + DMU_KEEP_CACHING = 1 << 9, /* Don't affect caching. */ +} dmu_flags_t; + +/* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus @@ -547,7 +590,7 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp); int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, - uint32_t flags); + dmu_flags_t flags); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); @@ -558,9 +601,9 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); * Special spill buffer support used by "SA" framework */ -int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, - dmu_buf_t **dbp); -int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, +int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags, + const void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); @@ -579,17 +622,17 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, - const void *tag, dmu_buf_t **, int flags); + const void *tag, dmu_buf_t **, dmu_flags_t flags); int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, - const void *tag, dmu_buf_t **dbp, int flags); + const void *tag, dmu_buf_t **dbp, dmu_flags_t flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, - dmu_buf_t ***dbpp, uint32_t flags); + dmu_buf_t ***dbpp, dmu_flags_t flags); int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp); @@ -699,8 +742,8 @@ dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { - ASSERT(dbu->dbu_evict_func_sync == NULL); - ASSERT(dbu->dbu_evict_func_async == NULL); + ASSERT0P(dbu->dbu_evict_func_sync); + ASSERT0P(dbu->dbu_evict_func_async); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); @@ -781,6 +824,8 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags); +void dmu_buf_will_rewrite(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); @@ -814,7 +859,7 @@ void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, - int len); + uint64_t len, uint_t blksz); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, @@ -828,7 +873,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t flags); +int dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); @@ -874,40 +919,36 @@ int dmu_free_long_object(objset_t *os, uint64_t object); * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ -#define DMU_READ_PREFETCH 0 /* prefetch */ -#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ -#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ -#define DMU_DIRECTIO 4 /* use Direct I/O */ - int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags); + void *buf, dmu_flags_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, - uint32_t flags); + dmu_flags_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx, uint32_t flags); + const void *buf, dmu_tx_t *tx, dmu_flags_t flags); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL -int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); -int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); -int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size); +int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, + dmu_flags_t flags); +int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, + dmu_flags_t flags); +int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, + dmu_flags_t flags); int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx, dmu_flags_t flags); int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx, dmu_flags_t flags); int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx, dmu_flags_t flags); #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, - struct arc_buf *buf, dmu_tx_t *tx); + struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags); int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, - struct arc_buf *buf, dmu_tx_t *tx); + struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf extern uint_t zfs_max_recordsize; @@ -980,6 +1021,11 @@ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); +typedef enum { + DDS_FLAG_ENCRYPTED = (1<<0), + DDS_FLAG_HAS_ENCRYPTED = (1<<7), +} dmu_objset_flag_t; + typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; @@ -989,6 +1035,7 @@ typedef struct dmu_objset_stats { uint8_t dds_inconsistent; uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; + uint8_t dds_flags; /* dmu_objset_flag_t */ } dmu_objset_stats_t; /* diff --git a/sys/contrib/openzfs/include/sys/dmu_impl.h b/sys/contrib/openzfs/include/sys/dmu_impl.h index dc2b66d06e7c..bae872bd1907 100644 --- a/sys/contrib/openzfs/include/sys/dmu_impl.h +++ b/sys/contrib/openzfs/include/sys/dmu_impl.h @@ -168,12 +168,10 @@ extern "C" { * dn_allocated_txg * dn_free_txg * dn_assigned_txg - * dn_dirty_txg + * dn_dirtycnt * dd_assigned_tx * dn_notxholds * dn_nodnholds - * dn_dirtyctx - * dn_dirtyctx_firstset * (dn_phys copy fields?) * (dn_phys contents?) * held from: @@ -270,11 +268,13 @@ void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *); -int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags); -int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *); +int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t); +int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, dmu_flags_t, + dmu_tx_t *); #if defined(_KERNEL) -int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t); -int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *); +int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t); +int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_flags_t, + dmu_tx_t *); #endif #ifdef __cplusplus diff --git a/sys/contrib/openzfs/include/sys/dmu_objset.h b/sys/contrib/openzfs/include/sys/dmu_objset.h index 288ad30166df..492be29200e4 100644 --- a/sys/contrib/openzfs/include/sys/dmu_objset.h +++ b/sys/contrib/openzfs/include/sys/dmu_objset.h @@ -152,7 +152,7 @@ struct objset { * The largest zpl file block allowed in special class. * cached here instead of zfsvfs for easier access. */ - int os_zpl_special_smallblock; + uint64_t os_zpl_special_smallblock; /* * Pointer is constant; the blkptr it points to is protected by diff --git a/sys/contrib/openzfs/include/sys/dmu_recv.h b/sys/contrib/openzfs/include/sys/dmu_recv.h index cd292d9244b0..ffb2b602d73f 100644 --- a/sys/contrib/openzfs/include/sys/dmu_recv.h +++ b/sys/contrib/openzfs/include/sys/dmu_recv.h @@ -60,7 +60,6 @@ typedef struct dmu_recv_cookie { uint64_t drc_ivset_guid; void *drc_owner; cred_t *drc_cred; - proc_t *drc_proc; nvlist_t *drc_begin_nvl; objset_t *drc_os; diff --git a/sys/contrib/openzfs/include/sys/dmu_traverse.h b/sys/contrib/openzfs/include/sys/dmu_traverse.h index 3196b2addeee..70cafa4c74f1 100644 --- a/sys/contrib/openzfs/include/sys/dmu_traverse.h +++ b/sys/contrib/openzfs/include/sys/dmu_traverse.h @@ -59,6 +59,13 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ #define TRAVERSE_NO_DECRYPT (1<<5) +/* + * Always use logical birth time for birth time comparisons. This is useful + * for operations that care about user data changes rather than physical + * block rewrites (e.g., incremental replication). + */ +#define TRAVERSE_LOGICAL (1<<6) + /* Special traverse error return value to indicate skipping of children */ #define TRAVERSE_VISIT_NO_CHILDREN -1 diff --git a/sys/contrib/openzfs/include/sys/dmu_tx.h b/sys/contrib/openzfs/include/sys/dmu_tx.h index b87836ecc2d9..ce49a0c49044 100644 --- a/sys/contrib/openzfs/include/sys/dmu_tx.h +++ b/sys/contrib/openzfs/include/sys/dmu_tx.h @@ -25,6 +25,7 @@ */ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #ifndef _SYS_DMU_TX_H @@ -80,6 +81,9 @@ struct dmu_tx { /* has this transaction already been delayed? */ boolean_t tx_dirty_delayed; + /* whether dmu_tx_wait() should return on suspend */ + boolean_t tx_break_on_suspend; + int tx_err; }; @@ -143,7 +147,7 @@ extern dmu_tx_stats_t dmu_tx_stats; * These routines are defined in dmu.h, and are called by the user. */ dmu_tx_t *dmu_tx_create(objset_t *dd); -int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +int dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); diff --git a/sys/contrib/openzfs/include/sys/dmu_zfetch.h b/sys/contrib/openzfs/include/sys/dmu_zfetch.h index 963e841a4882..a5ddd28026ce 100644 --- a/sys/contrib/openzfs/include/sys/dmu_zfetch.h +++ b/sys/contrib/openzfs/include/sys/dmu_zfetch.h @@ -81,9 +81,10 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t); -void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, +void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t, boolean_t); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t, + boolean_t, boolean_t); #ifdef __cplusplus diff --git a/sys/contrib/openzfs/include/sys/dnode.h b/sys/contrib/openzfs/include/sys/dnode.h index 76218c8b09ca..8bd1db5b7165 100644 --- a/sys/contrib/openzfs/include/sys/dnode.h +++ b/sys/contrib/openzfs/include/sys/dnode.h @@ -141,12 +141,6 @@ struct dmu_buf_impl; struct objset; struct zio; -enum dnode_dirtycontext { - DN_UNDIRTIED, - DN_DIRTY_OPEN, - DN_DIRTY_SYNC -}; - /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ #define DNODE_FLAG_USED_BYTES (1 << 0) #define DNODE_FLAG_USERUSED_ACCOUNTED (1 << 1) @@ -340,11 +334,9 @@ struct dnode { uint64_t dn_allocated_txg; uint64_t dn_free_txg; uint64_t dn_assigned_txg; - uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ + uint8_t dn_dirtycnt; kcondvar_t dn_notxholds; kcondvar_t dn_nodnholds; - enum dnode_dirtycontext dn_dirtyctx; - const void *dn_dirtyctx_firstset; /* dbg: contents meaningless */ /* protected by own devices */ zfs_refcount_t dn_tx_holds; @@ -440,7 +432,6 @@ void dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting); int dnode_try_claim(objset_t *os, uint64_t object, int slots); boolean_t dnode_is_dirty(dnode_t *dn); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); -void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); @@ -468,9 +459,6 @@ void dnode_free_interior_slots(dnode_t *dn); void dnode_set_storage_type(dnode_t *dn, dmu_object_type_t type); -#define DNODE_IS_DIRTY(_dn) \ - ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) - #define DNODE_LEVEL_IS_CACHEABLE(_dn, _level) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (((_level) > 0 || DMU_OT_IS_METADATA((_dn)->dn_type)) && \ diff --git a/sys/contrib/openzfs/include/sys/dsl_dataset.h b/sys/contrib/openzfs/include/sys/dsl_dataset.h index 624f3ddde9f0..2e1f9847f34c 100644 --- a/sys/contrib/openzfs/include/sys/dsl_dataset.h +++ b/sys/contrib/openzfs/include/sys/dsl_dataset.h @@ -276,6 +276,12 @@ dsl_dataset_phys(dsl_dataset_t *ds) return ((dsl_dataset_phys_t *)ds->ds_dbuf->db_data); } +typedef struct dsl_dataset_clone_arg_t { + const char *ddca_clone; + const char *ddca_origin; + cred_t *ddca_cred; +} dsl_dataset_clone_arg_t; + typedef struct dsl_dataset_promote_arg { const char *ddpa_clonename; dsl_dataset_t *ddpa_clone; @@ -284,7 +290,6 @@ typedef struct dsl_dataset_promote_arg { uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; nvlist_t *err_ds; cred_t *cr; - proc_t *proc; } dsl_dataset_promote_arg_t; typedef struct dsl_dataset_rollback_arg { @@ -299,7 +304,6 @@ typedef struct dsl_dataset_snapshot_arg { nvlist_t *ddsa_props; nvlist_t *ddsa_errors; cred_t *ddsa_cr; - proc_t *ddsa_proc; } dsl_dataset_snapshot_arg_t; typedef struct dsl_dataset_rename_snapshot_arg { @@ -366,6 +370,9 @@ uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx); int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx); int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); +void dsl_dataset_clone_sync(void *arg, dmu_tx_t *tx); +int dsl_dataset_clone_check(void *arg, dmu_tx_t *tx); +int dsl_dataset_clone(const char *clone, const char *origin); void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx); int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx); int dsl_dataset_promote(const char *name, char *conflsnap); @@ -459,7 +466,7 @@ int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx); int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, - dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc); + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx); diff --git a/sys/contrib/openzfs/include/sys/dsl_deleg.h b/sys/contrib/openzfs/include/sys/dsl_deleg.h index ae729b9f32ff..36dd6211219d 100644 --- a/sys/contrib/openzfs/include/sys/dsl_deleg.h +++ b/sys/contrib/openzfs/include/sys/dsl_deleg.h @@ -46,6 +46,7 @@ extern "C" { #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_SEND_RAW "send:raw" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_RECEIVE_APPEND "receive:append" #define ZFS_DELEG_PERM_ALLOW "allow" diff --git a/sys/contrib/openzfs/include/sys/dsl_dir.h b/sys/contrib/openzfs/include/sys/dsl_dir.h index 6135835fca48..d2e9e2282975 100644 --- a/sys/contrib/openzfs/include/sys/dsl_dir.h +++ b/sys/contrib/openzfs/include/sys/dsl_dir.h @@ -185,11 +185,11 @@ int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation); int dsl_dir_activate_fs_ss_limit(const char *); int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, - cred_t *, proc_t *); + cred_t *); void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); int dsl_dir_rename(const char *oldname, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, - uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *, proc_t *); + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); diff --git a/sys/contrib/openzfs/include/sys/fm/fs/zfs.h b/sys/contrib/openzfs/include/sys/fm/fs/zfs.h index 30ced55da138..a771b11420fd 100644 --- a/sys/contrib/openzfs/include/sys/fm/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fm/fs/zfs.h @@ -58,6 +58,7 @@ extern "C" { #define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" #define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" #define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" +#define FM_EREPORT_ZFS_SITOUT "sitout" #define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" #define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" @@ -103,6 +104,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY "zio_priority" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TYPE "zio_type" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" diff --git a/sys/contrib/openzfs/include/sys/frame.h b/sys/contrib/openzfs/include/sys/frame.h index dbcf1087bed8..fe1db28b7077 100644 --- a/sys/contrib/openzfs/include/sys/frame.h +++ b/sys/contrib/openzfs/include/sys/frame.h @@ -31,8 +31,16 @@ extern "C" { #else #include <linux/frame.h> #endif +#if defined(_ASM) && ! defined(HAVE_STACK_FRAME_NON_STANDARD_ASM) +.macro STACK_FRAME_NON_STANDARD func:req +.endm +#endif #else #define STACK_FRAME_NON_STANDARD(func) +#if defined(_ASM) +.macro STACK_FRAME_NON_STANDARD func:req +.endm +#endif #endif #ifdef __cplusplus diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h index 2d27aee217e0..662fd81c5ee1 100644 --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -197,6 +197,12 @@ typedef enum { ZFS_PROP_VOLTHREADING, ZFS_PROP_DIRECT, ZFS_PROP_LONGNAME, + ZFS_PROP_DEFAULTUSERQUOTA, + ZFS_PROP_DEFAULTGROUPQUOTA, + ZFS_PROP_DEFAULTPROJECTQUOTA, + ZFS_PROP_DEFAULTUSEROBJQUOTA, + ZFS_PROP_DEFAULTGROUPOBJQUOTA, + ZFS_PROP_DEFAULTPROJECTOBJQUOTA, ZFS_NUM_PROPS } zfs_prop_t; @@ -379,6 +385,8 @@ typedef enum { VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, + VDEV_PROP_SIT_OUT, + VDEV_PROP_AUTOSIT, VDEV_NUM_PROPS } vdev_prop_t; @@ -740,6 +748,8 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_MIN_ALLOC "min_alloc" +#define ZPOOL_CONFIG_MAX_ALLOC "max_alloc" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ @@ -1614,6 +1624,18 @@ typedef enum zfs_ioc { #endif +typedef struct zfs_rewrite_args { + uint64_t off; + uint64_t len; + uint64_t flags; + uint64_t arg; +} zfs_rewrite_args_t; + +/* zfs_rewrite_args flags */ +#define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */ + +#define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t) + /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. @@ -1655,6 +1677,7 @@ typedef enum { ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, ZFS_ERR_STREAM_LARGE_MICROZAP, + ZFS_ERR_TOO_MANY_SITOUTS, } zfs_errno_t; /* diff --git a/sys/contrib/openzfs/include/sys/metaslab.h b/sys/contrib/openzfs/include/sys/metaslab.h index c0844dac9187..36cbe06bacce 100644 --- a/sys/contrib/openzfs/include/sys/metaslab.h +++ b/sys/contrib/openzfs/include/sys/metaslab.h @@ -41,7 +41,7 @@ extern "C" { typedef struct metaslab_ops { const char *msop_name; - uint64_t (*msop_alloc)(metaslab_t *, uint64_t); + uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *); } metaslab_ops_t; @@ -81,9 +81,12 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_ASYNC_ALLOC 0x8 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, - uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *); + uint64_t, const blkptr_t *, int, zio_alloc_list_t *, int, const void *); +int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t, + blkptr_t *, int, uint64_t, const blkptr_t *, int, zio_alloc_list_t *, + int, const void *, uint64_t *); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, - dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); + dva_t *, int, const dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); @@ -95,21 +98,24 @@ void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_stat_init(void); void metaslab_stat_fini(void); +void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); -metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *, - boolean_t); +metaslab_class_t *metaslab_class_create(spa_t *, const char *, + const metaslab_ops_t *, boolean_t); void metaslab_class_destroy(metaslab_class_t *); void metaslab_class_validate(metaslab_class_t *); void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *, - boolean_t, boolean_t *); -boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, + uint64_t, boolean_t, boolean_t *); +boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, + uint64_t); void metaslab_class_evict_old(metaslab_class_t *, uint64_t); +const char *metaslab_class_get_name(metaslab_class_t *); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); uint64_t metaslab_class_get_dspace(metaslab_class_t *); @@ -118,7 +124,7 @@ uint64_t metaslab_class_get_deferred(metaslab_class_t *); void metaslab_space_update(vdev_t *, metaslab_class_t *, int64_t, int64_t, int64_t); -metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); +metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); @@ -127,6 +133,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); +void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int, + uint64_t, const void *); void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t, const void *); void metaslab_recalculate_weight_and_sort(metaslab_t *); diff --git a/sys/contrib/openzfs/include/sys/metaslab_impl.h b/sys/contrib/openzfs/include/sys/metaslab_impl.h index 4408dcfddd4a..6ce995d0a086 100644 --- a/sys/contrib/openzfs/include/sys/metaslab_impl.h +++ b/sys/contrib/openzfs/include/sys/metaslab_impl.h @@ -181,7 +181,8 @@ typedef struct metaslab_class_allocator { struct metaslab_class { kmutex_t mc_lock; spa_t *mc_spa; - const metaslab_ops_t *mc_ops; + const char *mc_name; + const metaslab_ops_t *mc_ops; /* * Track the number of metaslab groups that have been initialized @@ -269,7 +270,6 @@ struct metaslab_group { kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; - int mg_allocators; metaslab_group_allocator_t mg_allocator[]; }; @@ -539,6 +539,8 @@ typedef struct metaslab_unflushed_phys { uint64_t msp_unflushed_txg; } metaslab_unflushed_phys_t; +char *metaslab_rt_name(metaslab_group_t *, metaslab_t *, const char *); + #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/mod.h b/sys/contrib/openzfs/include/sys/mod.h deleted file mode 100644 index 4122889ab758..000000000000 --- a/sys/contrib/openzfs/include/sys/mod.h +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <behlendorf1@llnl.gov>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - */ -#ifndef _SYS_MOD_H -#define _SYS_MOD_H - -#ifdef _KERNEL -#include <sys/mod_os.h> -#else -/* - * Exported symbols - */ -#define EXPORT_SYMBOL(x) -#endif - -#endif /* SYS_MOD_H */ diff --git a/sys/contrib/openzfs/include/sys/nvpair.h b/sys/contrib/openzfs/include/sys/nvpair.h index 66362a9dbd41..6bec9702eb16 100644 --- a/sys/contrib/openzfs/include/sys/nvpair.h +++ b/sys/contrib/openzfs/include/sys/nvpair.h @@ -267,6 +267,8 @@ _SYS_NVPAIR_H int nvlist_lookup_double(const nvlist_t *, const char *, double *); #endif +_SYS_NVPAIR_H int nvlist_snprintf(char *, size_t, nvlist_t *, int); + _SYS_NVPAIR_H int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); _SYS_NVPAIR_H int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, int *, const char **); diff --git a/sys/contrib/openzfs/include/sys/range_tree.h b/sys/contrib/openzfs/include/sys/range_tree.h index 23e80f64284b..0f6def36f9f6 100644 --- a/sys/contrib/openzfs/include/sys/range_tree.h +++ b/sys/contrib/openzfs/include/sys/range_tree.h @@ -49,6 +49,9 @@ typedef enum zfs_range_seg_type { ZFS_RANGE_SEG_NUM_TYPES, } zfs_range_seg_type_t; +#define ZFS_RT_NAME(rt) (((rt)->rt_name != NULL) ? (rt)->rt_name : "") +#define ZFS_RT_F_DYN_NAME (1ULL << 0) /* if rt_name must be freed */ + /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. @@ -68,6 +71,9 @@ typedef struct zfs_range_tree { void *rt_arg; uint64_t rt_gap; /* allowable inter-segment gap */ + uint64_t rt_flags; + const char *rt_name; /* details for debugging */ + /* * The rt_histogram maintains a histogram of ranges. Each bucket, * rt_histogram[i], contains the number of ranges whose size is: @@ -232,8 +238,7 @@ zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) } static inline void -zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, - uint64_t fill) +zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { @@ -271,7 +276,7 @@ static inline void zfs_rs_set_fill(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); - zfs_zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); + zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); } typedef void zfs_range_tree_func_t(void *arg, uint64_t start, uint64_t size); @@ -281,6 +286,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, uint64_t gap); zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); +zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t flags, const char *name); void zfs_range_tree_destroy(zfs_range_tree_t *rt); boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index 439cd461b710..f172f2af6f07 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -140,7 +140,7 @@ typedef struct zio_cksum_salt { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | + * 7 |R| padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -175,6 +175,7 @@ typedef struct zio_cksum_salt { * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type + * R rewrite (reallocated/rewritten at phys birth TXG) * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by @@ -190,11 +191,11 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | pad | ASIZE | + * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | pad | ASIZE | + * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -204,7 +205,7 @@ typedef struct zio_cksum_salt { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | + * 7 |R| padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -373,7 +374,8 @@ typedef enum bp_embedded_type { typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_prop2; /* additional properties */ + uint64_t blk_pad; /* Extra space for the future */ uint64_t blk_birth_word[2]; uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ @@ -476,32 +478,51 @@ typedef struct blkptr { #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) +/* + * Block birth time macros for different use cases: + * - BP_GET_LOGICAL_BIRTH(): When the block was logically modified by user. + * To be used with a focus on user data, like incremental replication. + * - BP_GET_PHYSICAL_BIRTH(): When the block was physically written to disks. + * For regular writes is equal to logical birth. For dedup and block cloning + * can be smaller than logical birth. For remapped and rewritten blocks can + * be bigger. To be used with focus on physical disk content: ARC, DDT, scrub. + * - BP_GET_RAW_PHYSICAL_BIRTH(): Raw physical birth value. Zero if equal + * to logical birth. Should only be used for BP copying and debugging. + * - BP_GET_BIRTH(): When the block was allocated, which is a physical birth + * for rewritten blocks (rewrite flag set) or logical birth otherwise. + */ #define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1] #define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x)) -#define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] +#define BP_GET_RAW_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] #define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x)) -#define BP_GET_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \ +#define BP_GET_PHYSICAL_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BP_GET_RAW_PHYSICAL_BIRTH(bp) ? BP_GET_RAW_PHYSICAL_BIRTH(bp) : \ BP_GET_LOGICAL_BIRTH(bp)) -#define BP_SET_BIRTH(bp, logical, physical) \ -{ \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BP_SET_LOGICAL_BIRTH(bp, logical); \ - BP_SET_PHYSICAL_BIRTH(bp, \ - ((logical) == (physical) ? 0 : (physical))); \ +#define BP_GET_BIRTH(bp) \ + ((BP_IS_EMBEDDED(bp) || !BP_GET_REWRITE(bp)) ? \ + BP_GET_LOGICAL_BIRTH(bp) : BP_GET_PHYSICAL_BIRTH(bp)) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BP_SET_LOGICAL_BIRTH(bp, logical); \ + BP_SET_PHYSICAL_BIRTH(bp, \ + ((logical) == (physical) ? 0 : (physical))); \ } #define BP_GET_FILL(bp) \ - ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ - ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) + (BP_IS_EMBEDDED(bp) ? 1 : \ + BP_IS_ENCRYPTED(bp) ? BF64_GET((bp)->blk_fill, 0, 32) : \ + (bp)->blk_fill) #define BP_SET_FILL(bp, fill) \ { \ - if (BP_IS_ENCRYPTED(bp)) \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + if (BP_IS_ENCRYPTED(bp)) \ BF64_SET((bp)->blk_fill, 0, 32, fill); \ else \ (bp)->blk_fill = fill; \ @@ -516,6 +537,15 @@ typedef struct blkptr { BF64_SET((bp)->blk_fill, 32, 32, iv2); \ } +#define BP_GET_REWRITE(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : BF64_GET((bp)->blk_prop2, 63, 1)) + +#define BP_SET_REWRITE(bp, x) \ +{ \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop2, 63, 1, x); \ +} + #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) @@ -545,7 +575,7 @@ typedef struct blkptr { (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ - (BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \ + (BP_GET_PHYSICAL_BIRTH(bp1) == BP_GET_PHYSICAL_BIRTH(bp2) && \ BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ @@ -588,8 +618,8 @@ typedef struct blkptr { { \ BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ - (bp)->blk_pad[0] = 0; \ - (bp)->blk_pad[1] = 0; \ + (bp)->blk_prop2 = 0; \ + (bp)->blk_pad = 0; \ (bp)->blk_birth_word[0] = 0; \ (bp)->blk_birth_word[1] = 0; \ (bp)->blk_fill = 0; \ @@ -696,7 +726,7 @@ typedef struct blkptr { (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \ - (u_longlong_t)BP_GET_BIRTH(bp), \ + (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ @@ -784,6 +814,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_L2CACHE_TRIM 0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 #define SPA_ASYNC_DETACH_SPARE 0x4000 +#define SPA_ASYNC_REMOVE_BY_USER 0x8000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); @@ -849,7 +880,6 @@ extern kcondvar_t spa_namespace_cv; #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); -extern void spa_config_load(void); extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, @@ -980,9 +1010,9 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, - uint32_t flags); + dmu_flags_t flags); extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, - uint32_t flags); + dmu_flags_t flags); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, @@ -1000,7 +1030,7 @@ extern void spa_import_progress_set_notes_nolog(spa_t *spa, extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); -extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, +extern void spa_config_enter_priority(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); @@ -1054,6 +1084,7 @@ extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern void spa_get_min_alloc_range(spa_t *spa, uint64_t *min, uint64_t *max); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); @@ -1064,6 +1095,7 @@ extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); +extern metaslab_class_t *spa_special_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio); extern boolean_t spa_special_has_ddt(spa_t *spa); @@ -1115,7 +1147,9 @@ extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_dedup(spa_t *spa); extern boolean_t spa_has_slogs(spa_t *spa); +extern boolean_t spa_has_special(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); @@ -1179,7 +1213,7 @@ extern void zfs_ereport_taskq_fini(void); extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); -extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_approx_errlog_size(spa_t *spa); @@ -1210,7 +1244,6 @@ extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); -extern void spa_boot_init(void *); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 8c52f751a819..62b062984d36 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -55,6 +55,8 @@ #include <sys/dsl_deadlist.h> #include <zfeature_common.h> +#include "zfs_crrd.h" + #ifdef __cplusplus extern "C" { #endif @@ -246,6 +248,7 @@ struct spa { metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ + metaslab_class_t *spa_special_embedded_log_class; /* log on special */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ @@ -262,6 +265,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_max_alloc; /* of vdevs in normal class */ uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ @@ -343,6 +347,12 @@ struct spa { spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; + kmutex_t spa_txg_log_time_lock; /* for spa_txg_log_time */ + dbrrd_t spa_txg_log_time; + uint64_t spa_last_noted_txg; + uint64_t spa_last_noted_txg_time; + uint64_t spa_last_flush_txg_time; + space_map_t *spa_syncing_log_sm; /* current log space map */ avl_tree_t spa_sm_logs_by_txg; kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ diff --git a/sys/contrib/openzfs/include/sys/txg.h b/sys/contrib/openzfs/include/sys/txg.h index 70ba89c8ac09..eabb6f7aab4e 100644 --- a/sys/contrib/openzfs/include/sys/txg.h +++ b/sys/contrib/openzfs/include/sys/txg.h @@ -25,6 +25,7 @@ */ /* * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #ifndef _SYS_TXG_H @@ -66,6 +67,23 @@ typedef struct txg_list { txg_node_t *tl_head[TXG_SIZE]; } txg_list_t; +/* + * Wait flags for txg_wait_synced_flags(). By default (TXG_WAIT_NONE), it will + * wait until the wanted txg is reached, or block forever. Additional flags + * indicate other conditions that the caller is interested in, that will cause + * the wait to break and return an error code describing the condition. + */ +typedef enum { + /* No special flags. Guaranteed to block forever or return 0 */ + TXG_WAIT_NONE = 0, + + /* If a signal arrives while waiting, abort and return EINTR */ + TXG_WAIT_SIGNAL = (1 << 0), + + /* If the pool suspends while waiting, abort and return ESHUTDOWN. */ + TXG_WAIT_SUSPEND = (1 << 1), +} txg_wait_flag_t; + struct dsl_pool; extern void txg_init(struct dsl_pool *dp, uint64_t txg); @@ -86,13 +104,21 @@ extern void txg_kick(struct dsl_pool *dp, uint64_t txg); * Try to make this happen as soon as possible (eg. kick off any * necessary syncs immediately). If txg==0, wait for the currently open * txg to finish syncing. + * See txg_wait_flag_t above for a description of how the flags affect the wait. + */ +extern int txg_wait_synced_flags(struct dsl_pool *dp, uint64_t txg, + txg_wait_flag_t flags); + +/* + * Traditional form of txg_wait_synced_flags, waits forever. + * Shorthand for VERIFY0(txg_wait_synced_flags(dp, TXG_WAIT_NONE)) */ extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); /* - * Wait as above. Returns true if the thread was signaled while waiting. + * Wake all threads waiting in txg_wait_synced_flags() so they can reevaluate. */ -extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); +extern void txg_wait_kick(struct dsl_pool *dp); /* * Wait until the given transaction group, or one after it, is diff --git a/sys/contrib/openzfs/include/sys/vdev.h b/sys/contrib/openzfs/include/sys/vdev.h index a6a41882d3cf..510474d6c085 100644 --- a/sys/contrib/openzfs/include/sys/vdev.h +++ b/sys/contrib/openzfs/include/sys/vdev.h @@ -100,6 +100,7 @@ extern boolean_t vdev_replace_in_progress(vdev_t *vdev); extern void vdev_hold(vdev_t *); extern void vdev_rele(vdev_t *); +void vdev_update_nonallocating_space(vdev_t *vd, boolean_t add); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); @@ -133,9 +134,12 @@ extern void vdev_space_update(vdev_t *vd, extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space); +extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, + uint64_t txg); extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_get_min_alloc(vdev_t *vd); /* * Return the amount of space allocated for a gang block header. Note that @@ -145,7 +149,20 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); + return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0)); +} + +/* + * Return the amount of data that can be stored in a gang header. Because we + * need to ensure gang headers can always be allocated (as long as there is + * space available), this is the minimum allocatable size on the vdev. Note that + * since the physical birth txg is not provided, this must be constant for + * a given vdev. (e.g. raidz expansion can't change this) + */ +static inline uint64_t +vdev_gang_header_psize(vdev_t *vd) +{ + return (vdev_get_min_alloc(vd)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); diff --git a/sys/contrib/openzfs/include/sys/vdev_draid.h b/sys/contrib/openzfs/include/sys/vdev_draid.h index d44ab6681db9..e923092a39ad 100644 --- a/sys/contrib/openzfs/include/sys/vdev_draid.h +++ b/sys/contrib/openzfs/include/sys/vdev_draid.h @@ -95,7 +95,7 @@ extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **); */ extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); -extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); +extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t, uint64_t); extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *); extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index a2a3e25d14cc..5a8c2f846be2 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -103,7 +103,8 @@ typedef const struct vdev_ops { vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; - vdev_asize_func_t *vdev_op_asize; + vdev_asize_func_t *vdev_op_psize_to_asize; + vdev_asize_func_t *vdev_op_asize_to_psize; vdev_min_asize_func_t *vdev_op_min_asize; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; @@ -278,10 +279,12 @@ struct vdev { uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ + boolean_t vdev_autosit; /* automatic sitout management */ boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ + uint64_t vdev_last_latency_check; /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ @@ -430,6 +433,10 @@ struct vdev { hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ + /* used to calculate average read latency */ + uint64_t *vdev_prev_histo; + int64_t vdev_outlier_count; /* read outlier amongst peers */ + hrtime_t vdev_read_sit_out_expire; /* end of sit out period */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* @@ -615,11 +622,11 @@ extern vdev_ops_t vdev_indirect_ops; */ extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); +extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); -extern uint64_t vdev_get_min_alloc(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); @@ -643,10 +650,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); -#if defined(__linux__) +#if defined(__linux__) && defined(_KERNEL) int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp); #endif int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS); +char *vdev_rt_name(vdev_t *vd, const char *name); /* * Vdev ashift optimization tunables diff --git a/sys/contrib/openzfs/include/sys/vdev_raidz.h b/sys/contrib/openzfs/include/sys/vdev_raidz.h index 3b02728cdbf3..df8c2aed4045 100644 --- a/sys/contrib/openzfs/include/sys/vdev_raidz.h +++ b/sys/contrib/openzfs/include/sys/vdev_raidz.h @@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); struct raidz_row *vdev_raidz_row_alloc(int, zio_t *); void vdev_raidz_reflow_copy_scratch(spa_t *); void raidz_dtl_reassessed(vdev_t *); +boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t); +void vdev_raidz_sit_child(vdev_t *, uint64_t); +void vdev_raidz_unsit_child(vdev_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; diff --git a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h index debce6f09a22..8c8dcfb077f6 100644 --- a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h @@ -119,6 +119,7 @@ typedef struct raidz_col { uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? */ uint8_t rc_force_repair:1; /* Write good data to this column */ uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + uint8_t rc_latency_outlier:1; /* Latency outlier for this device */ int rc_shadow_devidx; /* for double write during expansion */ int rc_shadow_error; /* for double write during expansion */ uint64_t rc_shadow_offset; /* for double write during expansion */ @@ -133,6 +134,7 @@ typedef struct raidz_row { int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ + int rr_outlier_cnt; /* Count of latency outlier devices */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ diff --git a/sys/contrib/openzfs/include/sys/xvattr.h b/sys/contrib/openzfs/include/sys/xvattr.h index 447842d269b3..5dadbdb4c619 100644 --- a/sys/contrib/openzfs/include/sys/xvattr.h +++ b/sys/contrib/openzfs/include/sys/xvattr.h @@ -311,6 +311,7 @@ xva_getxoptattr(xvattr_t *xvap) */ #define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ #define V_APPEND 0x2 /* want to do append only check */ +#define V_NAMEDATTR 0x4 /* is a named attribute check */ /* * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations diff --git a/sys/contrib/openzfs/include/sys/zcp.h b/sys/contrib/openzfs/include/sys/zcp.h index 96279deaee75..5fcfb6219870 100644 --- a/sys/contrib/openzfs/include/sys/zcp.h +++ b/sys/contrib/openzfs/include/sys/zcp.h @@ -76,7 +76,6 @@ typedef struct zcp_run_info { * rather than the 'current' thread's. */ cred_t *zri_cred; - proc_t *zri_proc; /* * The tx in which this channel program is running. diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h index 549f54c09383..7112d3ef5c99 100644 --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -205,18 +205,6 @@ extern void vpanic(const char *, va_list) #define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) /* - * Tunables. - */ -typedef struct zfs_kernel_param { - const char *name; /* unused stub */ -} zfs_kernel_param_t; - -#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) -#define ZFS_MODULE_PARAM_ARGS void -#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ - getfunc, perm, desc) - -/* * Threads. */ typedef pthread_t kthread_t; @@ -236,6 +224,11 @@ typedef pthread_t kthread_t; #define thread_join(t) pthread_join((pthread_t)(t), NULL) #define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) +/* + * Check if the current thread is a memory reclaim thread. + * Always returns false in userspace (no memory reclaim thread). + */ +#define current_is_reclaim_thread() (0) /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { @@ -623,8 +616,10 @@ extern void delay(clock_t ticks); * Process priorities as defined by setpriority(2) and getpriority(2). */ #define minclsyspri 19 -#define maxclsyspri -20 #define defclsyspri 0 +/* Write issue taskq priority. */ +#define wtqclsyspri -19 +#define maxclsyspri -20 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) #define CPU_SEQID_UNSTABLE CPU_SEQID @@ -632,6 +627,9 @@ extern void delay(clock_t ticks); #define kcred NULL #define CRED() NULL +#define crhold(cr) ((void)cr) +#define crfree(cr) ((void)cr) + #define ptob(x) ((x) * PAGESIZE) #define NN_DIVISOR_1000 (1U << 0) @@ -668,7 +666,7 @@ extern void random_fini(void); struct spa; extern void show_pool_stats(struct spa *); -extern int set_global_var(char const *arg); +extern int handle_tunable_option(const char *, boolean_t); typedef struct callb_cpr { kmutex_t *cc_lockp; @@ -744,7 +742,6 @@ extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int secpolicy_zfs(const cred_t *cr); -extern int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc); extern zoneid_t getzoneid(void); /* SID stuff */ @@ -774,7 +771,6 @@ typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); extern void spl_fstrans_unmark(fstrans_cookie_t); -extern int __spl_pf_fstrans_check(void); extern int kmem_cache_reap_active(void); diff --git a/sys/contrib/openzfs/include/sys/zfs_debug.h b/sys/contrib/openzfs/include/sys/zfs_debug.h index 871936da15f6..4d4cd4c39e97 100644 --- a/sys/contrib/openzfs/include/sys/zfs_debug.h +++ b/sys/contrib/openzfs/include/sys/zfs_debug.h @@ -39,6 +39,8 @@ extern "C" { #define FALSE 0 #endif +#include <sys/nvpair.h> + extern int zfs_flags; extern int zfs_recover; extern int zfs_free_leak_on_eio; @@ -104,6 +106,24 @@ extern void zfs_panic_recover(const char *fmt, ...); extern void zfs_dbgmsg_init(void); extern void zfs_dbgmsg_fini(void); +/* + * When printing an nvlist, print one beginning line with the file/func/line + * number and the text "nvlist <var name>:" followed by all the nvlist lines + * without the file/fun/line number. This makes the nvlist lines easy to read. + */ +#define zfs_dbgmsg_nvlist(nv) \ + if (zfs_dbgmsg_enable) { \ + zfs_dbgmsg("nvlist "#nv":"); \ + __zfs_dbgmsg_nvlist(nv); \ + } + +#define zfs_dbgmsg(...) \ + if (zfs_dbgmsg_enable) \ + __dprintf(B_FALSE, __FILE__, __func__, __LINE__, __VA_ARGS__) + + +extern void __zfs_dbgmsg_nvlist(nvlist_t *nv); + #ifndef _KERNEL extern int dprintf_find_string(const char *string); extern void zfs_dbgmsg_print(int fd, const char *tag); diff --git a/sys/contrib/openzfs/include/sys/zfs_file.h b/sys/contrib/openzfs/include/sys/zfs_file.h index a1f344c2bb79..67abe9988aaa 100644 --- a/sys/contrib/openzfs/include/sys/zfs_file.h +++ b/sys/contrib/openzfs/include/sys/zfs_file.h @@ -46,7 +46,7 @@ void zfs_file_close(zfs_file_t *fp); int zfs_file_write(zfs_file_t *fp, const void *buf, size_t len, ssize_t *resid); int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t len, loff_t off, - ssize_t *resid); + uint8_t ashift, ssize_t *resid); int zfs_file_read(zfs_file_t *fp, void *buf, size_t len, ssize_t *resid); int zfs_file_pread(zfs_file_t *fp, void *buf, size_t len, loff_t off, ssize_t *resid); diff --git a/sys/contrib/openzfs/include/sys/zfs_ioctl.h b/sys/contrib/openzfs/include/sys/zfs_ioctl.h index 1805028024e6..cfe11f43bb8e 100644 --- a/sys/contrib/openzfs/include/sys/zfs_ioctl.h +++ b/sys/contrib/openzfs/include/sys/zfs_ioctl.h @@ -455,6 +455,7 @@ typedef enum zinject_type { ZINJECT_DECRYPT_FAULT, ZINJECT_DELAY_IMPORT, ZINJECT_DELAY_EXPORT, + ZINJECT_DELAY_READY, } zinject_type_t; typedef enum zinject_iotype { @@ -534,10 +535,22 @@ typedef struct zfs_cmd { zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; - zinject_record_t zc_inject_record; - uint32_t zc_defer_destroy; - uint32_t zc_flags; - uint64_t zc_action_handle; + + /* + * zinject_record_t grew past its original size, which would push out + * the size of zfs_cmd_t. To adjust for this, we allow it to use the + * space after it, since those fields aren't used with ZFS_IOC_INJECT. + */ + union { + zinject_record_t zc_inject_record; + struct { + char zc_pad1[sizeof (zinject_record_t) - 16]; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + }; + }; + int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ @@ -548,6 +561,20 @@ typedef struct zfs_cmd { uint64_t zc_zoneid; } zfs_cmd_t; +/* + * zfs_cmd_t (and by extension, it's member structs) must always be the same + * size. Changing it will break compatibility between the kernel module and the + * userspace tools. + * + * This test is convoluted because MAXPATHLEN and MAXNAMELEN can vary across + * platforms. We include them directly here, which means it won't trip if those + * ever change, but if that happens we likely have other things to worry about. + */ +#define _expected_zfs_cmd_size ((MAXPATHLEN*3)+MAXNAMELEN+1200) +_Static_assert(sizeof (zfs_cmd_t) == _expected_zfs_cmd_size, + "zfs_cmd_t has wrong size"); +#undef _expected_zfs_cmd_size + typedef struct zfs_useracct { char zu_domain[256]; uid_t zu_rid; diff --git a/sys/contrib/openzfs/include/sys/zfs_quota.h b/sys/contrib/openzfs/include/sys/zfs_quota.h index f12a0f2db394..62389cd2f3b2 100644 --- a/sys/contrib/openzfs/include/sys/zfs_quota.h +++ b/sys/contrib/openzfs/include/sys/zfs_quota.h @@ -35,7 +35,7 @@ extern int zpl_get_file_info(dmu_object_type_t, extern int zfs_userspace_one(struct zfsvfs *, zfs_userquota_prop_t, const char *, uint64_t, uint64_t *); extern int zfs_userspace_many(struct zfsvfs *, zfs_userquota_prop_t, - uint64_t *, void *, uint64_t *); + uint64_t *, void *, uint64_t *, uint64_t *); extern int zfs_set_userquota(struct zfsvfs *, zfs_userquota_prop_t, const char *, uint64_t, uint64_t); diff --git a/sys/contrib/openzfs/include/sys/zfs_racct.h b/sys/contrib/openzfs/include/sys/zfs_racct.h index 939e8fa666e9..562029d4114d 100644 --- a/sys/contrib/openzfs/include/sys/zfs_racct.h +++ b/sys/contrib/openzfs/include/sys/zfs_racct.h @@ -33,7 +33,9 @@ /* * Platform-dependent resource accounting hooks */ -void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); -void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); +void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, + dmu_flags_t flags); +void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, + dmu_flags_t flags); #endif /* _SYS_ZFS_RACCT_H */ diff --git a/sys/contrib/openzfs/include/sys/zfs_vfsops.h b/sys/contrib/openzfs/include/sys/zfs_vfsops.h index 18cc31e7183f..8b8f73cf3540 100644 --- a/sys/contrib/openzfs/include/sys/zfs_vfsops.h +++ b/sys/contrib/openzfs/include/sys/zfs_vfsops.h @@ -27,7 +27,7 @@ #ifndef _SYS_ZFS_VFSOPS_H #define _SYS_ZFS_VFSOPS_H -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_WANT_ZNODE) #include <sys/zfs_vfsops_os.h> #endif diff --git a/sys/contrib/openzfs/include/sys/zfs_vnops.h b/sys/contrib/openzfs/include/sys/zfs_vnops.h index 21f0da4fe6b4..08cf0e2a6e48 100644 --- a/sys/contrib/openzfs/include/sys/zfs_vnops.h +++ b/sys/contrib/openzfs/include/sys/zfs_vnops.h @@ -40,6 +40,7 @@ extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *, uint64_t *, cred_t *); extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t, const blkptr_t *, size_t); +extern int zfs_rewrite(znode_t *, uint64_t, uint64_t, uint64_t, uint64_t); extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); diff --git a/sys/contrib/openzfs/include/sys/zfs_znode.h b/sys/contrib/openzfs/include/sys/zfs_znode.h index b3a267e16f3e..79b845a672a8 100644 --- a/sys/contrib/openzfs/include/sys/zfs_znode.h +++ b/sys/contrib/openzfs/include/sys/zfs_znode.h @@ -73,7 +73,7 @@ extern "C" { pflags |= attr; \ else \ pflags &= ~attr; \ - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(ZTOZSB(zp)), \ + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(ZTOZSB(zp)), \ &pflags, sizeof (pflags), tx)); \ } @@ -163,8 +163,9 @@ extern int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_WANT_ZNODE) #include <sys/zfs_znode_impl.h> +#include <sys/zfs_rlock.h> /* * Directory entry locks control access to directory entries. @@ -201,8 +202,6 @@ typedef struct znode { uint64_t z_size; /* file size (cached) */ uint64_t z_pflags; /* pflags (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ - uint32_t z_sync_writes_cnt; /* synchronous write count */ - uint32_t z_async_writes_cnt; /* asynchronous write count */ mode_t z_mode; /* mode (cached) */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ @@ -219,7 +218,9 @@ typedef struct znode { */ ZNODE_OS_FIELDS; } znode_t; +#endif +#ifdef _KERNEL /* Verifies the znode is valid. */ static inline int zfs_verify_zp(znode_t *zp) diff --git a/sys/contrib/openzfs/include/sys/zil.h b/sys/contrib/openzfs/include/sys/zil.h index fa7945d8ab8b..da085998879b 100644 --- a/sys/contrib/openzfs/include/sys/zil.h +++ b/sys/contrib/openzfs/include/sys/zil.h @@ -456,7 +456,7 @@ typedef enum { WR_NUM_STATES /* number of states */ } itx_wr_state_t; -typedef void (*zil_callback_t)(void *data); +typedef void (*zil_callback_t)(void *data, int err); typedef struct itx { list_node_t itx_node; /* linkage on zl_itx_list */ @@ -498,10 +498,13 @@ typedef struct zil_stats { * (see zil_commit_writer_stall()) * - suspend: ZIL suspended * (see zil_commit(), zil_get_commit_list()) + * - crash: ZIL crashed + * (see zil_crash(), zil_commit(), ...) */ kstat_named_t zil_commit_error_count; kstat_named_t zil_commit_stall_count; kstat_named_t zil_commit_suspend_count; + kstat_named_t zil_commit_crash_count; /* * Number of transactions (reads, writes, renames, etc.) @@ -549,6 +552,7 @@ typedef struct zil_sums { wmsum_t zil_commit_error_count; wmsum_t zil_commit_stall_count; wmsum_t zil_commit_suspend_count; + wmsum_t zil_commit_crash_count; wmsum_t zil_itx_count; wmsum_t zil_itx_indirect_count; wmsum_t zil_itx_indirect_bytes; @@ -577,6 +581,25 @@ typedef struct zil_sums { #define ZIL_STAT_BUMP(zil, stat) \ ZIL_STAT_INCR(zil, stat, 1); +/* + * Flags for zil_commit_flags(). zil_commit() is a shortcut for + * zil_commit_flags(ZIL_COMMIT_FAILMODE), which is the most common use. + */ +typedef enum { + /* + * Try to commit the ZIL. If it fails, fall back to txg_wait_synced(). + * If that fails, return EIO. + */ + ZIL_COMMIT_NOW = 0, + + /* + * Like ZIL_COMMIT_NOW, but if the ZIL commit fails because the pool + * suspended, act according to the pool's failmode= setting (wait for + * the pool to resume, or return EIO). + */ + ZIL_COMMIT_FAILMODE = (1 << 1), +} zil_commit_flag_t; + typedef int zil_parse_blk_func_t(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t txg); typedef int zil_parse_lr_func_t(zilog_t *zilog, const lr_t *lr, void *arg, @@ -606,14 +629,16 @@ extern boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); -extern void zil_itx_destroy(itx_t *itx); +extern void zil_itx_destroy(itx_t *itx, int err); extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); -extern void zil_commit(zilog_t *zilog, uint64_t oid); -extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); extern void zil_remove_async(zilog_t *zilog, uint64_t oid); +extern int zil_commit_flags(zilog_t *zilog, uint64_t oid, + zil_commit_flag_t flags); +extern int __must_check zil_commit(zilog_t *zilog, uint64_t oid); + extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, struct dsl_dataset *ds, void *txarg); @@ -635,6 +660,8 @@ extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern uint64_t zil_max_copied_data(zilog_t *zilog); extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize); +extern itx_wr_state_t zil_write_state(zilog_t *zilog, uint64_t size, + uint32_t blocksize, boolean_t o_direct, boolean_t commit); extern void zil_sums_init(zil_sums_t *zs); extern void zil_sums_fini(zil_sums_t *zs); @@ -642,6 +669,8 @@ extern void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums); extern int zil_replay_disable; +extern uint_t zfs_immediate_write_sz; +extern int zil_special_is_slog; #ifdef __cplusplus } diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h index 252264b9eae9..ea1364a7e35a 100644 --- a/sys/contrib/openzfs/include/sys/zil_impl.h +++ b/sys/contrib/openzfs/include/sys/zil_impl.h @@ -41,8 +41,8 @@ extern "C" { * * An lwb will start out in the "new" state, and transition to the "opened" * state via a call to zil_lwb_write_open() on first itx assignment. When - * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be - * held. + * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" and + * LWB's "lwb_lock" must be held. * * After the lwb is "opened", it can be assigned number of itxs and transition * into the "closed" state via zil_lwb_write_close() when full or on timeout. @@ -100,16 +100,22 @@ typedef enum { * holding the "zl_issuer_lock". After the lwb is issued, the zilog's * "zl_lock" is used to protect the lwb against concurrent access. */ +typedef enum { + LWB_FLAG_SLIM = (1<<0), /* log block has slim format */ + LWB_FLAG_SLOG = (1<<1), /* lwb_blk is on SLOG device */ + LWB_FLAG_CRASHED = (1<<2), /* lwb is on the crash list */ +} lwb_flag_t; + typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_slim; /* log block has slim format */ - boolean_t lwb_slog; /* lwb_blk is on SLOG device */ + lwb_flag_t lwb_flags; /* extra info about this lwb */ int lwb_error; /* log block allocation error */ int lwb_nmax; /* max bytes in the buffer */ int lwb_nused; /* # used bytes in buffer */ int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ + int lwb_min_sz; /* min size for range allocation */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ zio_t *lwb_child_zio; /* parent zio for children */ @@ -124,7 +130,7 @@ typedef struct lwb { list_t lwb_itxs; /* list of itx's */ list_t lwb_waiters; /* list of zil_commit_waiter's */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ - kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ + kmutex_t lwb_lock; /* protects lwb_vdev_tree and size */ } lwb_t; /* @@ -149,7 +155,7 @@ typedef struct zil_commit_waiter { list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */ lwb_t *zcw_lwb; /* back pointer to lwb when linked */ boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */ - int zcw_zio_error; /* contains the zio io_error value */ + int zcw_error; /* result to return from zil_commit() */ } zil_commit_waiter_t; /* @@ -221,6 +227,7 @@ struct zilog { uint64_t zl_cur_left; /* current burst remaining size */ uint64_t zl_cur_max; /* biggest record in current burst */ list_t zl_lwb_list; /* in-flight log write list */ + list_t zl_lwb_crash_list; /* log writes in-flight at crash */ avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ @@ -245,6 +252,9 @@ struct zilog { */ uint64_t zl_max_block_size; + /* After crash, txg to restart zil */ + uint64_t zl_restart_txg; + /* Pointer for per dataset zil sums */ zil_sums_t *zl_sums; }; diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 78adca4d7d00..acb0a03a36b2 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -59,21 +59,37 @@ typedef struct zio_eck { /* * Gang block headers are self-checksumming and contain an array - * of block pointers. + * of block pointers. The old gang block size has enough room for 3 blkptrs, + * while new gang blocks can store more. + * + * Layout: + * +--------+--------+--------+-----+---------+-----------+ + * | | | | | | | + * | blkptr | blkptr | blkptr | ... | padding | zio_eck_t | + * | 1 | 2 | 3 | | | | + * +--------+--------+--------+-----+---------+-----------+ + * 128B 128B 128B 88B 40B */ -#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE -#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t)) / sizeof (blkptr_t)) -#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t) - \ - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ - sizeof (uint64_t)) - -typedef struct zio_gbh { - blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; - uint64_t zg_filler[SPA_GBH_FILLER]; - zio_eck_t zg_tail; -} zio_gbh_phys_t; +#define SPA_OLD_GANGBLOCKSIZE SPA_MINBLOCKSIZE +typedef void zio_gbh_phys_t; + +static inline uint64_t +gbh_nblkptrs(uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t)); +} + +static inline zio_eck_t * +gbh_eck(zio_gbh_phys_t *gbh, uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((zio_eck_t *)((uintptr_t)gbh + (size_t)size - + sizeof (zio_eck_t))); +} + +static inline blkptr_t * +gbh_bp(zio_gbh_phys_t *gbh, int bp) { + return (&((blkptr_t *)gbh)[bp]); +} enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, @@ -196,7 +212,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_DONT_RETRY (1ULL << 10) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) -#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) +#define ZIO_FLAG_ALLOC_THROTTLED (1ULL << 14) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -226,7 +242,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 29) #define ZIO_FLAG_REEXECUTED (1ULL << 30) #define ZIO_FLAG_DELEGATED (1ULL << 31) -#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32) +#define ZIO_FLAG_PREALLOCATED (1ULL << 32) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -345,25 +361,26 @@ struct zbookmark_err_phys { (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { - enum zio_checksum zp_checksum; - enum zio_compress zp_compress; + enum zio_checksum zp_checksum:8; + enum zio_compress zp_compress:8; uint8_t zp_complevel; uint8_t zp_level; uint8_t zp_copies; uint8_t zp_gang_copies; - dmu_object_type_t zp_type; - boolean_t zp_dedup; - boolean_t zp_dedup_verify; - boolean_t zp_nopwrite; - boolean_t zp_brtwrite; - boolean_t zp_encrypt; - boolean_t zp_byteorder; - boolean_t zp_direct_write; + dmu_object_type_t zp_type:8; + dmu_object_type_t zp_storage_type:8; + boolean_t zp_dedup:1; + boolean_t zp_dedup_verify:1; + boolean_t zp_nopwrite:1; + boolean_t zp_brtwrite:1; + boolean_t zp_encrypt:1; + boolean_t zp_byteorder:1; + boolean_t zp_direct_write:1; + boolean_t zp_rewrite:1; + uint32_t zp_zpl_smallblk; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; - uint32_t zp_zpl_smallblk; - dmu_object_type_t zp_storage_type; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; @@ -398,7 +415,9 @@ typedef struct zio_vsd_ops { typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; - struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; + uint64_t gn_gangblocksize; + uint64_t gn_allocsize; + struct zio_gang_node *gn_child[]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, @@ -417,14 +436,16 @@ typedef struct zio_transform { typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* - * The io_reexecute flags are distinct from io_flags because the child must - * be able to propagate them to the parent. The normal io_flags are local - * to the zio, not protected by any lock, and not modifiable by children; - * the reexecute flags are protected by io_lock, modifiable by children, - * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. + * The io_post flags describe additional actions that a parent IO should + * consider or perform on behalf of a child. They are distinct from io_flags + * because the child must be able to propagate them to the parent. The normal + * io_flags are local to the zio, not protected by any lock, and not modifiable + * by children; the reexecute flags are protected by io_lock, modifiable by + * children, and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ -#define ZIO_REEXECUTE_NOW 0x01 -#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_POST_REEXECUTE (1 << 0) +#define ZIO_POST_SUSPEND (1 << 1) +#define ZIO_POST_DIO_CHKSUM_ERR (1 << 2) /* * The io_trim flags are used to specify the type of TRIM to perform. They @@ -460,7 +481,7 @@ struct zio { enum zio_child io_child_type; enum trim_flag io_trim_flags; zio_priority_t io_priority; - uint8_t io_reexecute; + uint8_t io_post; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; @@ -602,7 +623,8 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, - blkptr_t *new_bp, uint64_t size, boolean_t *slog); + blkptr_t *new_bp, uint64_t min_size, uint64_t max_size, boolean_t *slog, + boolean_t allow_larger); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); @@ -620,7 +642,6 @@ extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); -extern void zio_add_child_first(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); @@ -697,6 +718,7 @@ extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); +extern hrtime_t zio_handle_ready_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/sys/contrib/openzfs/include/sys/zvol.h b/sys/contrib/openzfs/include/sys/zvol.h index 32e703650935..5791246e99e4 100644 --- a/sys/contrib/openzfs/include/sys/zvol.h +++ b/sys/contrib/openzfs/include/sys/zvol.h @@ -36,8 +36,7 @@ #define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \ DEV_BSHIFT - 1)) - 1) -extern void zvol_create_minor(const char *); -extern void zvol_create_minors_recursive(const char *); +extern void zvol_create_minors(const char *); extern void zvol_remove_minors(spa_t *, const char *, boolean_t); extern void zvol_rename_minors(spa_t *, const char *, const char *, boolean_t); @@ -54,7 +53,7 @@ extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volthreading(const char *, boolean_t); extern int zvol_set_common(const char *, zfs_prop_t, zprop_source_t, uint64_t); extern int zvol_set_ro(const char *, boolean_t); -extern zvol_state_handle_t *zvol_suspend(const char *); +extern int zvol_suspend(const char *, zvol_state_handle_t **); extern int zvol_resume(zvol_state_handle_t *); extern void *zvol_tag(zvol_state_handle_t *); diff --git a/sys/contrib/openzfs/include/sys/zvol_impl.h b/sys/contrib/openzfs/include/sys/zvol_impl.h index 3a40b40f7f3d..5422e66832c0 100644 --- a/sys/contrib/openzfs/include/sys/zvol_impl.h +++ b/sys/contrib/openzfs/include/sys/zvol_impl.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ #ifndef _SYS_ZVOL_IMPL_H @@ -56,10 +56,37 @@ typedef struct zvol_state { atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ kcondvar_t zv_removing_cv; /* ready to remove minor */ + list_node_t zv_remove_node; /* node on removal list */ struct zvol_state_os *zv_zso; /* private platform state */ boolean_t zv_threading; /* volthreading property */ } zvol_state_t; +/* + * zvol taskqs + */ +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; + +typedef struct zv_request_stack { + zvol_state_t *zv; + struct bio *bio; +#ifdef __linux__ + struct request *rq; +#endif +} zv_request_t; + +typedef struct zv_request_task { + zv_request_t zvr; + taskq_ent_t ent; +} zv_request_task_t; + +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 extern krwlock_t zvol_state_lock; #define ZVOL_HT_SIZE 1024 @@ -67,8 +94,13 @@ extern struct hlist_head *zvol_htable; #define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE]; -extern unsigned int zvol_volmode; extern unsigned int zvol_inhibit_dev; +extern unsigned int zvol_prefetch_bytes; +extern unsigned int zvol_volmode; +extern unsigned int zvol_threads; +extern unsigned int zvol_num_taskqs; +extern unsigned int zvol_request_sync; +extern zv_taskq_t zvol_taskqs; /* * platform independent functions exported to platform code @@ -77,7 +109,6 @@ zvol_state_t *zvol_find_by_name_hash(const char *name, uint64_t hash, int mode); int zvol_first_open(zvol_state_t *zv, boolean_t readonly); uint64_t zvol_name_hash(const char *name); -void zvol_remove_minors_impl(const char *name); void zvol_last_close(zvol_state_t *zv); void zvol_insert(zvol_state_t *zv); void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, @@ -94,16 +125,18 @@ int zvol_clone_range(zvol_state_handle_t *, uint64_t, void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps); +zv_request_task_t *zv_request_task_create(zv_request_t zvr); +void zv_request_task_free(zv_request_task_t *task); /* * platform dependent functions exported to platform independent code */ void zvol_os_free(zvol_state_t *zv); -void zvol_os_rename_minor(zvol_state_t *zv, const char *newname); +int zvol_os_rename_minor(zvol_state_t *zv, const char *newname); int zvol_os_create_minor(const char *name); int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize); boolean_t zvol_os_is_zvol(const char *path); -void zvol_os_clear_private(zvol_state_t *zv); +void zvol_os_remove_minor(zvol_state_t *zv); void zvol_os_set_disk_ro(zvol_state_t *zv, int flags); void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity); diff --git a/sys/contrib/openzfs/include/zfeature_common.h b/sys/contrib/openzfs/include/zfeature_common.h index 85537c1ae96e..56382ca85b55 100644 --- a/sys/contrib/openzfs/include/zfeature_common.h +++ b/sys/contrib/openzfs/include/zfeature_common.h @@ -87,6 +87,9 @@ typedef enum spa_feature { SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, SPA_FEATURE_LARGE_MICROZAP, + SPA_FEATURE_DYNAMIC_GANG_HEADER, + SPA_FEATURE_BLOCK_CLONING_ENDIAN, + SPA_FEATURE_PHYSICAL_REWRITE, SPA_FEATURES } spa_feature_t; @@ -103,7 +106,15 @@ typedef enum zfeature_flags { /* Activate this feature at the same time it is enabled. */ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), /* Each dataset has a field set if it has ever used this feature. */ - ZFEATURE_FLAG_PER_DATASET = (1 << 3) + ZFEATURE_FLAG_PER_DATASET = (1 << 3), + /* + * This feature isn't enabled by zpool upgrade; it must be explicitly + * listed to be enabled. It will also be applied if listed in an + * explicitly provided compatibility list. This flag can be removed + * from a given feature once support is sufficiently widespread, or + * worries about backwards compatibility are no longer relevant. + */ + ZFEATURE_FLAG_NO_UPGRADE = (1 << 4) } zfeature_flags_t; typedef enum zfeature_type { diff --git a/sys/contrib/openzfs/include/zfs_crrd.h b/sys/contrib/openzfs/include/zfs_crrd.h new file mode 100644 index 000000000000..ba192a2062ea --- /dev/null +++ b/sys/contrib/openzfs/include/zfs_crrd.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024 Klara Inc. + * + * This software was developed by + * Mariusz Zaborski <mariusz.zaborski@klarasystems.com> + * Fred Weigel <fred.weigel@klarasystems.com> + * under sponsorship from Wasabi Technology, Inc. and Klara Inc. + */ + +#ifndef _CRRD_H_ +#define _CRRD_H_ + +#define RRD_MAX_ENTRIES 256 + +#define RRD_ENTRY_SIZE sizeof (uint64_t) +#define RRD_STRUCT_ELEM (sizeof (rrd_t) / RRD_ENTRY_SIZE) + +typedef enum { + DBRRD_FLOOR, + DBRRD_CEILING +} dbrrd_rounding_t; + +typedef struct { + uint64_t rrdd_time; + uint64_t rrdd_txg; +} rrd_data_t; + +typedef struct { + uint64_t rrd_head; /* head (beginning) */ + uint64_t rrd_tail; /* tail (end) */ + uint64_t rrd_length; + + rrd_data_t rrd_entries[RRD_MAX_ENTRIES]; +} rrd_t; + +typedef struct { + rrd_t dbr_minutes; + rrd_t dbr_days; + rrd_t dbr_months; +} dbrrd_t; + +size_t rrd_len(rrd_t *rrd); + +const rrd_data_t *rrd_entry(rrd_t *r, size_t i); +rrd_data_t *rrd_tail_entry(rrd_t *rrd); +uint64_t rrd_tail(rrd_t *rrd); +uint64_t rrd_get(rrd_t *rrd, size_t i); + +void rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg); + +void dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg); +uint64_t dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rouding); + +#endif diff --git a/sys/contrib/openzfs/include/zfs_deleg.h b/sys/contrib/openzfs/include/zfs_deleg.h index f80fe46d35f8..a7bbf1620ad5 100644 --- a/sys/contrib/openzfs/include/zfs_deleg.h +++ b/sys/contrib/openzfs/include/zfs_deleg.h @@ -55,6 +55,7 @@ typedef enum { ZFS_DELEG_NOTE_PROMOTE, ZFS_DELEG_NOTE_RENAME, ZFS_DELEG_NOTE_SEND, + ZFS_DELEG_NOTE_SEND_RAW, ZFS_DELEG_NOTE_RECEIVE, ZFS_DELEG_NOTE_ALLOW, ZFS_DELEG_NOTE_USERPROP, diff --git a/sys/contrib/openzfs/include/zfs_valstr.h b/sys/contrib/openzfs/include/zfs_valstr.h index 295449396c51..27b4c9ebb239 100644 --- a/sys/contrib/openzfs/include/zfs_valstr.h +++ b/sys/contrib/openzfs/include/zfs_valstr.h @@ -73,6 +73,7 @@ extern "C" { _ZFS_VALSTR_DECLARE_BITFIELD(zio_flag) _ZFS_VALSTR_DECLARE_BITFIELD(zio_stage) +_ZFS_VALSTR_DECLARE_ENUM(zio_type) _ZFS_VALSTR_DECLARE_ENUM(zio_priority) #undef _ZFS_VALSTR_DECLARE_BITFIELD |