diff options
author | Martin Matuska <mm@FreeBSD.org> | 2024-02-08 12:39:04 +0000 |
---|---|---|
committer | Martin Matuska <mm@FreeBSD.org> | 2024-02-08 15:51:08 +0000 |
commit | a4e5e0106ac7145f56eb39a691e302cabb4635be (patch) | |
tree | 8dac13394184eb6bc756212b00c57c85c1e7f276 | |
parent | 4594eb454891e6247a6ea786f467a0b960ddd835 (diff) | |
parent | 229b9f4ed05e6d14fb4d73fa04a71e99b01bb534 (diff) | |
download | src-a4e5e0106ac7145f56eb39a691e302cabb4635be.tar.gz src-a4e5e0106ac7145f56eb39a691e302cabb4635be.zip |
zfs: merge openzfs/zfs@229b9f4ed
Notable upstream pull request merges:
#15769 082338875 Add 'zpool status -e' flag to see unhealthy vdevs
#15804 a0d3fe72b libzdb: Initial breakout of libzdb
#15847 229b9f4ed LUA: Backport CVE-2020-24370's patch
Obtained from: OpenZFS
OpenZFS commit: 229b9f4ed05e6d14fb4d73fa04a71e99b01bb534
37 files changed, 736 insertions, 181 deletions
diff --git a/cddl/lib/Makefile b/cddl/lib/Makefile index 68250c8300b1..e6a0763544f7 100644 --- a/cddl/lib/Makefile +++ b/cddl/lib/Makefile @@ -20,6 +20,7 @@ SUBDIR.${MK_ZFS}+= \ libtpool \ libumem \ libuutil \ + libzdb \ libzfs \ libzfs_core \ libzfsbootenv \ diff --git a/cddl/lib/libzdb/Makefile b/cddl/lib/libzdb/Makefile new file mode 100644 index 000000000000..63248399b91c --- /dev/null +++ b/cddl/lib/libzdb/Makefile @@ -0,0 +1,25 @@ +.PATH: ${SRCTOP}/sys/contrib/openzfs/lib/libzdb +.PATH: ${SRCTOP}/sys/contrib/openzfs/include + +LIB= zdb +PACKAGE= zfs + +INCS = libzdb.h + +SRCS = libzdb.c + +WARNS?= 2 +CSTD= c99 + +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libspl/include/os/freebsd +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/zfs +CFLAGS+= -I${SRCTOP}/sys +CFLAGS+= -I${SRCTOP}/cddl/compat/opensolaris/include +CFLAGS+= -include ${SRCTOP}/sys/contrib/openzfs/include/os/freebsd/spl/sys/ccompile.h +CFLAGS+= -I${SRCTOP}/sys/contrib/openzfs/lib/libzutil +CFLAGS+= -DHAVE_ISSETUGID -DIN_BASE +CFLAGS+= -include ${SRCTOP}/sys/modules/zfs/zfs_config.h + +.include <bsd.lib.mk> diff --git a/cddl/lib/libzdb/Makefile.depend b/cddl/lib/libzdb/Makefile.depend new file mode 100644 index 000000000000..93249906da4f --- /dev/null +++ b/cddl/lib/libzdb/Makefile.depend @@ -0,0 +1,14 @@ +# Autogenerated - do NOT edit! 
+ +DIRDEPS = \ + include \ + lib/${CSU_DIR} \ + lib/libc \ + lib/libcompiler_rt \ + + +.include <dirdeps.mk> + +.if ${DEP_RELDIR} == ${_DEP_RELDIR} +# local dependencies - needed for -jN in clean tree +.endif diff --git a/cddl/usr.sbin/zdb/Makefile b/cddl/usr.sbin/zdb/Makefile index 744db789772c..e41f4afce82f 100644 --- a/cddl/usr.sbin/zdb/Makefile +++ b/cddl/usr.sbin/zdb/Makefile @@ -23,7 +23,7 @@ CFLAGS+= \ -include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \ -DHAVE_ISSETUGID -LIBADD= nvpair umem uutil zfs spl avl zutil zpool crypto +LIBADD= nvpair umem uutil zdb zfs spl avl zutil zpool crypto CFLAGS.gcc+= -fms-extensions # Since there are many asserts in this program, it makes no sense to compile diff --git a/rescue/rescue/Makefile b/rescue/rescue/Makefile index 7bf3299f4d48..0a8d142ef83a 100644 --- a/rescue/rescue/Makefile +++ b/rescue/rescue/Makefile @@ -153,7 +153,7 @@ CRUNCH_LIBS_zfs+= ${LIBBE} \ ${LIBNVPAIR} CRUNCH_LIBS_bectl+= ${CRUNCH_LIBS_zfs} CRUNCH_LIBS_zpool+= ${CRUNCH_LIBS_zfs} -CRUNCH_LIBS_zdb+= ${CRUNCH_LIBS_zfs} +CRUNCH_LIBS_zdb+= ${CRUNCH_LIBS_zfs} ${LIBZDB} .else # liblzma needs pthread CRUNCH_LIBS+= -lpthread diff --git a/share/mk/bsd.libnames.mk b/share/mk/bsd.libnames.mk index db08a5ac718c..414ae3164066 100644 --- a/share/mk/bsd.libnames.mk +++ b/share/mk/bsd.libnames.mk @@ -167,6 +167,7 @@ LIBXPG4?= ${LIBDESTDIR}${LIBDIR_BASE}/libxpg4.a LIBY?= ${LIBDESTDIR}${LIBDIR_BASE}/liby.a LIBYPCLNT?= ${LIBDESTDIR}${LIBDIR_BASE}/libypclnt.a LIBZ?= ${LIBDESTDIR}${LIBDIR_BASE}/libz.a +LIBZDB?= ${LIBDESTDIR}${LIBDIR_BASE}/libzdb.a LIBZFS?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs.a LIBZFS_CORE?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfs_core.a LIBZFSBOOTENV?= ${LIBDESTDIR}${LIBDIR_BASE}/libzfsbootenv.a diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk index 5a6932614b29..658dd1c3d699 100644 --- a/share/mk/src.libnames.mk +++ b/share/mk/src.libnames.mk @@ -223,6 +223,7 @@ _LIBRARIES= \ y \ ypclnt \ z \ + zdb \ zfs_core \ zfs \ zfsbootenv \ @@ -683,6 
+684,8 @@ LIBNVPAIRDIR= ${_LIB_OBJTOP}/cddl/lib/libnvpair LIBNVPAIR?= ${LIBNVPAIRDIR}/libnvpair${PIE_SUFFIX}.a LIBUMEMDIR= ${_LIB_OBJTOP}/cddl/lib/libumem LIBUUTILDIR= ${_LIB_OBJTOP}/cddl/lib/libuutil +LIBZDBDIR= ${_LIB_OBJTOP}/cddl/lib/libzdb +LIBZDB?= ${LIBZDBDIR}/libzdb${PIE_SUFFIX}.a LIBZFSDIR= ${_LIB_OBJTOP}/cddl/lib/libzfs LIBZFS?= ${LIBZFSDIR}/libzfs${PIE_SUFFIX}.a LIBZFS_COREDIR= ${_LIB_OBJTOP}/cddl/lib/libzfs_core diff --git a/sys/contrib/openzfs/cmd/zdb/Makefile.am b/sys/contrib/openzfs/cmd/zdb/Makefile.am index c93c9c37cd8d..ebdc19128e1a 100644 --- a/sys/contrib/openzfs/cmd/zdb/Makefile.am +++ b/sys/contrib/openzfs/cmd/zdb/Makefile.am @@ -10,6 +10,7 @@ zdb_SOURCES = \ %D%/zdb_il.c zdb_LDADD = \ + libzdb.la \ libzpool.la \ libzfs_core.la \ libnvpair.la diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 2062f4fa1026..afdc5a2c8b54 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -88,36 +88,10 @@ #include <libnvpair.h> #include <libzutil.h> -#include "zdb.h" +#include <libzdb.h> -#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ - zio_compress_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ - zio_checksum_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ - (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ - DMU_OT_ZAP_OTHER : \ - (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? 
\ - DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) - -/* Some platforms require part of inode IDs to be remapped */ -#ifdef __APPLE__ -#define ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2) -#else -#define ZDB_MAP_OBJECT_ID(obj) (obj) -#endif +#include "zdb.h" -static const char * -zdb_ot_name(dmu_object_type_t type) -{ - if (type < DMU_OT_NUMTYPES) - return (dmu_ot[type].ot_name); - else if ((type & DMU_OT_NEWTYPE) && - ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) - return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); - else - return ("UNKNOWN"); -} extern int reference_tracking_enable; extern int zfs_recover; @@ -135,35 +109,12 @@ typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); static uint64_t *zopt_metaslab = NULL; static unsigned zopt_metaslab_args = 0; -typedef struct zopt_object_range { - uint64_t zor_obj_start; - uint64_t zor_obj_end; - uint64_t zor_flags; -} zopt_object_range_t; static zopt_object_range_t *zopt_object_ranges = NULL; static unsigned zopt_object_args = 0; static int flagbits[256]; -#define ZOR_FLAG_PLAIN_FILE 0x0001 -#define ZOR_FLAG_DIRECTORY 0x0002 -#define ZOR_FLAG_SPACE_MAP 0x0004 -#define ZOR_FLAG_ZAP 0x0008 -#define ZOR_FLAG_ALL_TYPES -1 -#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ - ZOR_FLAG_DIRECTORY | \ - ZOR_FLAG_SPACE_MAP | \ - ZOR_FLAG_ZAP) - -#define ZDB_FLAG_CHECKSUM 0x0001 -#define ZDB_FLAG_DECOMPRESS 0x0002 -#define ZDB_FLAG_BSWAP 0x0004 -#define ZDB_FLAG_GBH 0x0008 -#define ZDB_FLAG_INDIRECT 0x0010 -#define ZDB_FLAG_RAW 0x0020 -#define ZDB_FLAG_PRINT_BLKPTR 0x0040 -#define ZDB_FLAG_VERBOSE 0x0080 static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; @@ -176,62 +127,7 @@ static void mos_obj_refd_multiple(uint64_t); static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); -typedef struct sublivelist_verify { - /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ - zfs_btree_t sv_pair; - - /* 
ALLOC's without a matching FREE, accumulates across sub-livelists */ - zfs_btree_t sv_leftover; -} sublivelist_verify_t; - -static int -livelist_compare(const void *larg, const void *rarg) -{ - const blkptr_t *l = larg; - const blkptr_t *r = rarg; - /* Sort them according to dva[0] */ - uint64_t l_dva0_vdev, r_dva0_vdev; - l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); - r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); - if (l_dva0_vdev < r_dva0_vdev) - return (-1); - else if (l_dva0_vdev > r_dva0_vdev) - return (+1); - - /* if vdevs are equal, sort by offsets. */ - uint64_t l_dva0_offset; - uint64_t r_dva0_offset; - l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); - r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); - if (l_dva0_offset < r_dva0_offset) { - return (-1); - } else if (l_dva0_offset > r_dva0_offset) { - return (+1); - } - - /* - * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, - * it's possible the offsets are equal. In that case, sort by txg - */ - if (l->blk_birth < r->blk_birth) { - return (-1); - } else if (l->blk_birth > r->blk_birth) { - return (+1); - } - return (0); -} - -typedef struct sublivelist_verify_block { - dva_t svb_dva; - - /* - * We need this to check if the block marked as allocated - * in the livelist was freed (and potentially reallocated) - * in the metaslab spacemaps at a later TXG. 
- */ - uint64_t svb_allocated_txg; -} sublivelist_verify_block_t; static void zdb_print_blkptr(const blkptr_t *bp, int flags); diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 11486f3f185e..8753d7263914 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -2161,6 +2161,7 @@ typedef struct status_cbdata { boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; + boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; @@ -2358,6 +2359,35 @@ health_str_to_color(const char *health) } /* + * Called for each leaf vdev. Returns 0 if the vdev is healthy. + * A vdev is unhealthy if any of the following are true: + * 1) there are read, write, or checksum errors, + * 2) its state is not ONLINE, or + * 3) slow IO reporting was requested (-s) and there are slow IOs. + */ +static int +vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) +{ + status_cbdata_t *cb = data; + vdev_stat_t *vs; + uint_t vsc; + (void) hdl_data; + + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) != 0) + return (1); + + if (vs->vs_checksum_errors || vs->vs_read_errors || + vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) + return (1); + + if (cb->cb_print_slow_ios && vs->vs_slow_ios) + return (1); + + return (0); +} + +/* * Print out configuration state as requested by status_callback. 
*/ static void @@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, const char *state; const char *type; const char *path = NULL; - const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; + const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, + *scolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, state = gettext("AVAIL"); } + /* + * If '-e' is specified then top-level vdevs and their children + * can be pruned if all of their leaves are healthy. + */ + if (cb->cb_print_unhealthy && depth > 0 && + for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { + return; + } + printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); @@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (vs->vs_checksum_errors) ccolor = ANSI_RED; + if (vs->vs_slow_ios) + scolor = ANSI_BLUE; + if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", @@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } if (cb->cb_literal) - printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + printf_color(scolor, " %5llu", + (u_longlong_t)vs->vs_slow_ios); else - printf(" %5s", rbuf); + printf_color(scolor, " %5s", rbuf); } if (cb->cb_print_power) { if (children == 0) { @@ -9106,9 +9150,11 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { + color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); + color_end(); } else { print_error_log(zhp); } @@ -9129,6 +9175,7 @@ status_callback(zpool_handle_t *zhp, void *data) * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + 
* -e Display only unhealthy vdevs * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. @@ -9160,7 +9207,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9187,6 +9234,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'e': + cb.cb_print_unhealthy = B_TRUE; + break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; diff --git a/sys/contrib/openzfs/config/kernel-blkdev.m4 b/sys/contrib/openzfs/config/kernel-blkdev.m4 index 8e9e638b125a..c5a353ca9203 100644 --- a/sys/contrib/openzfs/config/kernel-blkdev.m4 +++ b/sys/contrib/openzfs/config/kernel-blkdev.m4 @@ -524,6 +524,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ dnl # dnl # 5.19 API: blkdev_issue_secure_erase() +dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ @@ -539,6 +540,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ + #include <linux/blkdev.h> + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags, &biop); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include <linux/blkdev.h> ],[ @@ -562,13 +577,22 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_CHECKING([whether 
__blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, + [__blkdev_issue_discard() is available]) ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, + [blkdev_issue_discard() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + ]) ]) ]) ]) diff --git a/sys/contrib/openzfs/include/Makefile.am b/sys/contrib/openzfs/include/Makefile.am index 5f38f6ac6ddb..cb28a2d6c96c 100644 --- a/sys/contrib/openzfs/include/Makefile.am +++ b/sys/contrib/openzfs/include/Makefile.am @@ -186,6 +186,7 @@ USER_H = \ libuutil.h \ libuutil_common.h \ libuutil_impl.h \ + libzdb.h \ libzfs.h \ libzfs_core.h \ libzfsbootenv.h \ diff --git a/sys/contrib/openzfs/include/libzdb.h b/sys/contrib/openzfs/include/libzdb.h new file mode 100644 index 000000000000..ef910d0a2c5a --- /dev/null +++ b/sys/contrib/openzfs/include/libzdb.h @@ -0,0 +1,68 @@ +#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ + zio_compress_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ + zio_checksum_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ + (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ + DMU_OT_ZAP_OTHER : \ + (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? 
\ + DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) + +/* Some platforms require part of inode IDs to be remapped */ +#ifdef __APPLE__ +#define ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2) +#else +#define ZDB_MAP_OBJECT_ID(obj) (obj) +#endif + +#define ZOR_FLAG_PLAIN_FILE 0x0001 +#define ZOR_FLAG_DIRECTORY 0x0002 +#define ZOR_FLAG_SPACE_MAP 0x0004 +#define ZOR_FLAG_ZAP 0x0008 +#define ZOR_FLAG_ALL_TYPES -1 +#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ + ZOR_FLAG_DIRECTORY | \ + ZOR_FLAG_SPACE_MAP | \ + ZOR_FLAG_ZAP) + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + + +typedef struct zdb_ctx { +} zdb_ctx_t; + +typedef struct zopt_object_range { + uint64_t zor_obj_start; + uint64_t zor_obj_end; + uint64_t zor_flags; +} zopt_object_range_t; + + +typedef struct sublivelist_verify { + /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ + zfs_btree_t sv_pair; + + /* ALLOC's without a matching FREE, accumulates across sub-livelists */ + zfs_btree_t sv_leftover; +} sublivelist_verify_t; + +typedef struct sublivelist_verify_block { + dva_t svb_dva; + + /* + * We need this to check if the block marked as allocated + * in the livelist was freed (and potentially reallocated) + * in the metaslab spacemaps at a later TXG. 
+ */ + uint64_t svb_allocated_txg; +} sublivelist_verify_block_t; + +const char *zdb_ot_name(dmu_object_type_t type); +int livelist_compare(const void *larg, const void *rarg); diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index e7ebcccbe0ce..7f0f24325d59 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -285,7 +285,6 @@ typedef struct zfid_long { #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern int zfs_super_owner; -extern int zfs_bclone_enabled; extern void zfs_init(void); extern void zfs_fini(void); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h index 220466550258..b4d5db21f5e5 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -45,8 +45,6 @@ extern "C" { typedef struct zfsvfs zfsvfs_t; struct znode; -extern int zfs_bclone_enabled; - /* * This structure emulates the vfs_t from other platforms. 
It's purpose * is to facilitate the handling of mount options and minimize structural diff --git a/sys/contrib/openzfs/include/sys/zfs_vnops.h b/sys/contrib/openzfs/include/sys/zfs_vnops.h index 5da103f17783..e60b99bed192 100644 --- a/sys/contrib/openzfs/include/sys/zfs_vnops.h +++ b/sys/contrib/openzfs/include/sys/zfs_vnops.h @@ -24,8 +24,11 @@ #ifndef _SYS_FS_ZFS_VNOPS_H #define _SYS_FS_ZFS_VNOPS_H + #include <sys/zfs_vnops_os.h> +extern int zfs_bclone_enabled; + extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); diff --git a/sys/contrib/openzfs/lib/Makefile.am b/sys/contrib/openzfs/lib/Makefile.am index 499ebdaeba9b..050a6cac0a37 100644 --- a/sys/contrib/openzfs/lib/Makefile.am +++ b/sys/contrib/openzfs/lib/Makefile.am @@ -9,11 +9,11 @@ # These library interfaces are subject to change at any time. # # -# CMDS: zhack/ztest/zdb/ zfs/zpool/zed/ +# CMDS: zhack/ztest/ zfs/zpool/zed/ # raidz_{test,bench} zinject/zstream # | | # LIBS: | | libzfsbootenv* -# | | | +# |--libzdb--zdb | | # | | | # libzpool libzfs* ----------------+ # | | | \ / | | | @@ -62,6 +62,7 @@ include $(srcdir)/%D%/libspl/Makefile.am include $(srcdir)/%D%/libtpool/Makefile.am include $(srcdir)/%D%/libunicode/Makefile.am include $(srcdir)/%D%/libuutil/Makefile.am +include $(srcdir)/%D%/libzdb/Makefile.am include $(srcdir)/%D%/libzfs_core/Makefile.am include $(srcdir)/%D%/libzfs/Makefile.am include $(srcdir)/%D%/libzfsbootenv/Makefile.am diff --git a/sys/contrib/openzfs/lib/libzdb/Makefile.am b/sys/contrib/openzfs/lib/libzdb/Makefile.am new file mode 100644 index 000000000000..ec4fd92b984e --- /dev/null +++ b/sys/contrib/openzfs/lib/libzdb/Makefile.am @@ -0,0 +1,7 @@ +libzdb_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) +libzdb_la_CFLAGS += -fvisibility=hidden + +noinst_LTLIBRARIES += libzdb.la + +libzdb_la_SOURCES = \ + %D%/libzdb.c diff --git a/sys/contrib/openzfs/lib/libzdb/libzdb.c 
b/sys/contrib/openzfs/lib/libzdb/libzdb.c new file mode 100644 index 000000000000..9989fa1eb80f --- /dev/null +++ b/sys/contrib/openzfs/lib/libzdb/libzdb.c @@ -0,0 +1,102 @@ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <ctype.h> +#include <getopt.h> +#include <openssl/evp.h> +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_sa.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_bookmark.h> +#include <sys/dbuf.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/dmu_send.h> +#include <sys/dmu_traverse.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/zfs_fuid.h> +#include <sys/arc.h> +#include <sys/arc_impl.h> +#include <sys/ddt.h> +#include <sys/zfeature.h> +#include <sys/abd.h> +#include <sys/blkptr.h> +#include <sys/dsl_crypt.h> +#include <sys/dsl_scan.h> +#include <sys/btree.h> +#include <sys/brt.h> +#include <sys/brt_impl.h> +#include <zfs_comutil.h> +#include <sys/zstd/zstd.h> + +#include <libnvpair.h> +#include <libzutil.h> + +#include <libzdb.h> + +const char * +zdb_ot_name(dmu_object_type_t type) +{ + if (type < DMU_OT_NUMTYPES) + return (dmu_ot[type].ot_name); + else if ((type & DMU_OT_NEWTYPE) && + ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) + return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); + else + return ("UNKNOWN"); +} + +int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = larg; + const blkptr_t *r = rarg; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev, r_dva0_vdev; + l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + 
r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + if (l_dva0_vdev < r_dva0_vdev) + return (-1); + else if (l_dva0_vdev > r_dva0_vdev) + return (+1); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset; + uint64_t r_dva0_offset; + l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset < r_dva0_offset) { + return (-1); + } else if (l_dva0_offset > r_dva0_offset) { + return (+1); + } + + /* + * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, + * it's possible the offsets are equal. In that case, sort by txg + */ + if (l->blk_birth < r->blk_birth) { + return (-1); + } else if (l->blk_birth > r->blk_birth) { + return (+1); + } + return (0); +} diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 47471a805907..30c168253f96 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -1159,6 +1159,15 @@ Enable the experimental block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, attempts to clone blocks will act as though the feature is disabled. . +.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int +When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be +written to disk. +This allows the clone operation to reliably succeed when a file is +modified and then immediately cloned. +For small files this may be slower than making a copy of the file. +Therefore, this setting defaults to 0 which causes a clone operation to +immediately fail when encountering a dirty block. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. 
.Pp diff --git a/sys/contrib/openzfs/man/man8/zpool-status.8 b/sys/contrib/openzfs/man/man8/zpool-status.8 index 56fa4aed057b..24ad6e643cae 100644 --- a/sys/contrib/openzfs/man/man8/zpool-status.8 +++ b/sys/contrib/openzfs/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DigLpPstvx +.Op Fl DeigLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,6 +69,8 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl e +Only show unhealthy vdevs (not-ONLINE or with errors). .It Fl i Display vdev initialization status. .It Fl g diff --git a/sys/contrib/openzfs/module/lua/ldebug.c b/sys/contrib/openzfs/module/lua/ldebug.c index 0092474c762d..23e321bb1247 100644 --- a/sys/contrib/openzfs/module/lua/ldebug.c +++ b/sys/contrib/openzfs/module/lua/ldebug.c @@ -111,10 +111,11 @@ static const char *upvalname (Proto *p, int uv) { static const char *findvararg (CallInfo *ci, int n, StkId *pos) { int nparams = clLvalue(ci->func)->p->numparams; - if (n >= ci->u.l.base - ci->func - nparams) + int nvararg = cast_int(ci->u.l.base - ci->func) - nparams; + if (n <= -nvararg) return NULL; /* no such vararg */ else { - *pos = ci->func + nparams + n; + *pos = ci->func + nparams - n; return "(*vararg)"; /* generic name for any vararg */ } } @@ -126,7 +127,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n, StkId base; if (isLua(ci)) { if (n < 0) /* access to vararg values? 
*/ - return findvararg(ci, -n, pos); + return findvararg(ci, n, pos); else { base = ci->u.l.base; name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci)); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index f2d5391037c4..a972c720dfdb 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -89,10 +89,6 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); -int zfs_bclone_enabled = 1; -SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, - &zfs_bclone_enabled, 0, "Enable block cloning"); - struct zfs_jailparam { int mount_snapshot; }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index e7f0aa573848..b0bda5fa2012 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -862,27 +862,66 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) +BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + bio_put(bio); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + static int -vdev_disk_io_trim(zio_t *zio) +vdev_issue_discard_trim(zio_t *zio, unsigned long flags) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; + int ret; + struct bio *bio = NULL; -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { - return (-blkdev_issue_secure_erase(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } else { - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), - 
zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); +#if defined(BLKDEV_DISCARD_SECURE) + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#else + (void) flags; + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#endif + if (!ret && bio) { + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); } -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) + return (ret); +} +#endif + +static int +vdev_disk_io_trim(zio_t *zio) +{ unsigned long trim_flags = 0; -#if defined(BLKDEV_DISCARD_SECURE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) { +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + return (-blkdev_issue_secure_erase( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); +#elif defined(BLKDEV_DISCARD_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), + } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) + return (vdev_issue_discard_trim(zio, trim_flags)); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) + return (-blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" @@ -968,7 +1007,12 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + zio_interrupt(zio); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) zio_interrupt(zio); +#endif return; default: diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 
b7b89b8afc56..a32307c39331 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -4255,9 +4255,4 @@ EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); - -/* CSTYLED */ -module_param(zfs_bclone_enabled, uint, 0644); -MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); - #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c index 73476ff40ebf..3065d54fa9da 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c @@ -31,8 +31,6 @@ #include <sys/zfs_vnops.h> #include <sys/zfeature.h> -int zfs_bclone_enabled = 1; - /* * Clone part of a file via block cloning. * @@ -40,7 +38,7 @@ int zfs_bclone_enabled = 1; * care of that depending on how it was called. */ static ssize_t -__zpl_clone_file_range(struct file *src_file, loff_t src_off, +zpl_clone_file_range_impl(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len) { struct inode *src_i = file_inode(src_file); @@ -96,11 +94,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, { ssize_t ret; + /* Flags is reserved for future extensions and must be zero. */ if (flags != 0) return (-EINVAL); - /* Try to do it via zfs_clone_range() */ - ret = __zpl_clone_file_range(src_file, src_off, + /* Try to do it via zfs_clone_range() and allow shortening. */ + ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE @@ -137,6 +136,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the * range in both files and if they're the same, arrange for them to be backed * by the same storage. 
+ * + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range + * if we want. It's designed for filesystems that may need to shorten the + * length for alignment, EOF, or any other requirement. ZFS may shorten the + * request when there is outstanding dirty data which hasn't been written. */ loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, @@ -145,24 +149,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off, if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) return (-EINVAL); - /* - * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given - * range if we want. Its designed for filesystems that make data past - * EOF available, and don't want it to be visible in both files. ZFS - * doesn't do that, so we just turn the flag off. - */ - flags &= ~REMAP_FILE_CAN_SHORTEN; - + /* No support for dedup yet */ if (flags & REMAP_FILE_DEDUP) - /* No support for dedup yet */ return (-EOPNOTSUPP); /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ @@ -179,8 +180,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off, if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + /* The entire length must be cloned or this is an error. 
*/ + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ @@ -214,8 +221,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) size_t len = i_size_read(file_inode(src_file)); - ssize_t ret = - __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len); fput(src_file); @@ -253,7 +259,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (len == 0) len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; - ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, dst_file, fcr.fcr_dest_offset, len); fput(src_file); diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index c8ff7b6432fd..7f39ad6fc775 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -58,6 +58,26 @@ #include <sys/zfs_vfsops.h> #include <sys/zfs_znode.h> +/* + * Enable the experimental block cloning feature. If this setting is 0, then + * even if feature@block_cloning is enabled, attempts to clone blocks will act + * as though the feature is disabled. + */ +int zfs_bclone_enabled = 1; + +/* + * When set zfs_clone_range() waits for dirty data to be written to disk. + * This allows the clone operation to reliably succeed when a file is modified + * and then immediately cloned. For small files this may be slower than making + * a copy of the file and is therefore not the default. However, in certain + * scenarios this behavior may be desirable so a tunable is provided. + */ +static int zfs_bclone_wait_dirty = 0; + +/* + * Maximum bytes to read per chunk in zfs_read(). 
+ */ +static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) @@ -182,8 +202,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } -static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ - /* * Read bytes from specified file into supplied buffer. * @@ -1049,6 +1067,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, size_t maxblocks, nbps; uint_t inblksz; uint64_t clear_setid_bits_txg = 0; + uint64_t last_synced_txg = 0; inoff = *inoffp; outoff = *outoffp; @@ -1287,15 +1306,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } nbps = maxblocks; + last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created - * in the current transaction group, error will be - * EAGAIN here, which we can just return to the caller - * so it can fallback if it likes. + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. 
*/ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool(inos), + last_synced_txg + 1); + continue; + } + break; } @@ -1517,3 +1544,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, + "Enable block cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, + "Wait for dirty blocks when cloning"); diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 7e0990b5d9f9..502b4de2bae9 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -536,7 +536,8 @@ tags = ['functional', 'cli_root', 'zpool_split'] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', - 'zpool_status_007_pos', 'zpool_status_features_001_pos'] + 'zpool_status_007_pos', 'zpool_status_008_pos', + 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] @@ -631,7 +632,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos', 'cp_stress'] +tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index ae4aa6275465..edfdd47ee6d7 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'): 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': 
['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], + 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], @@ -312,6 +313,7 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], + 'cp_files/cp_files_002_pos': ['SKIP', cfr_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], 'fault/auto_replace_001_pos': ['FAIL', 14851], 'fault/auto_spare_002_pos': ['FAIL', 11889], diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index e4e380aa7fd5..a619b846dd11 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -94,6 +94,7 @@ VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled +BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index 4040e60434a7..01af258d59fe 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -1239,6 +1239,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ 
functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ @@ -1394,6 +1395,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ + functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh index 3bdd7db649f9..d6f32cdc7ac6 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh @@ -51,7 +51,7 @@ else fi set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \ - "-vx $testpool" + "-vx $testpool" "-e $testpool" "-es $testpool" log_assert "Executing 'zpool status' with correct options succeeds" @@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do (( i = i + 1 )) done +cleanup + log_pass "'zpool status' with correct options succeeded" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh index b501aac5ad6d..52b22dd833f0 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -37,6 +37,7 @@ # 3. Read the file # 4. Take a snapshot and make a clone # 5. 
Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' +# and 'zpool status -ev' function cleanup { @@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'" +log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'" log_mustnot eval "zpool status -v | grep '$TESTFS1'" log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh new file mode 100755 index 000000000000..6be2ad5a7410 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool status -e' only shows unhealthy devices. +# +# STRATEGY: +# 1. Create zpool +# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs +# 3. Verify vdevs are reported correctly with -e and -s +# 4. Verify parents are reported as DEGRADED +# 5. 
Verify healthy children are not reported +# + +function cleanup +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + zinject -c all + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 + log_must rm -f $all_vdevs +} + +log_assert "Verify 'zpool status -e'" + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/vdev{1..6}) +log_must mkdir -p $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) + +for raid_type in "draid2:3d:6c:1s" "raidz2"; do + + log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs + + # Check DEGRADED vdevs are shown. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE" + log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2 + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED" + + # Check FAULTED vdevs are shown. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE" + log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2 + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED" + + # Check no ONLINE vdevs are shown + log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" + + # Check no ONLINE slow vdevs are shown. Then mark IOs greater than + # 10ms slow, delay IOs 20ms to vdev6, check slow IOs. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE" + + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2 + log_must mkfile 1048576 /$TESTPOOL2/testfile + sync_pool $TESTPOOL2 + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + + # Check vdev6 slow IOs are only shown when requested with -s. + log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + + # Pool level and top-vdev level status must be DEGRADED. 
+ log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED" + log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED" + + # Check that healthy vdevs[1-3] aren't shown with -e. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" + + log_must zinject -c all + log_must zpool status -es $TESTPOOL2 + + zpool destroy $TESTPOOL2 +done + +log_pass "Verify zpool status -e shows only unhealthy vdevs" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh new file mode 100755 index 000000000000..60817449ab03 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -0,0 +1,161 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +# +# DESCRIPTION: +# Verify all cp --reflink modes work with modified file. +# +# STRATEGY: +# 1. Verify "cp --reflink=never|auto|always" behaves as expected. +# Two different modes of operation are tested. +# +# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL +# when there are dirty blocks which cannot be immediately cloned. +# This is the default behavior. +# +# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for +# dirty blocks to be written to disk allowing the clone to succeed. +# The downside to this is it may be slow which depending on the +# situation may defeat the point of making a clone. +# + +verify_runnable "global" +verify_block_cloning + +if ! is_linux; then + log_unsupported "cp --reflink is a GNU coreutils option" +fi + +function cleanup +{ + datasetexists $TESTPOOL/cp-reflink && \ + destroy_dataset $$TESTPOOL/cp-reflink -f + log_must set_tunable32 BCLONE_WAIT_DIRTY 0 +} + +function verify_copy +{ + src_cksum=$(sha256digest $1) + dst_cksum=$(sha256digest $2) + + if [[ "$src_cksum" != "$dst_cksum" ]]; then + log_must ls -l $CP_TESTDIR + log_fail "checksum mismatch ($src_cksum != $dst_cksum)" + fi +} + +log_assert "Verify all cp --reflink modes work with modified file" + +log_onexit cleanup + +SRC_FILE=src.data +DST_FILE=dst.data +SRC_SIZE=$(($RANDOM % 2048)) + +# A smaller recordsize is used merely to speed up the test. 
+RECORDSIZE=4096 + +log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink +CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink) + +log_must cd $CP_TESTDIR + +# Never wait on dirty blocks (zfs_bclone_wait_dirty=0) +log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $SRC_FILE $DST_FILE +done + +# Wait on dirty blocks (zfs_bclone_wait_dirty=1) +log_must set_tunable32 BCLONE_WAIT_DIRTY 1 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. 
+ log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $SRC_FILE $DST_FILE +done + +log_pass diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index 0c7cff4c796e..f452cffa20c8 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1152,7 +1152,7 @@ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.2.99-333-FreeBSD_g2e6b3c4d9" +#define ZFS_META_ALIAS "zfs-2.2.99-338-FreeBSD_g229b9f4ed" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -1182,7 +1182,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "333-FreeBSD_g2e6b3c4d9" +#define ZFS_META_RELEASE "338-FreeBSD_g229b9f4ed" /* Define the project version. */ #define ZFS_META_VERSION "2.2.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 5e86c5ebf6d0..04ced657e728 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.2.99-333-g2e6b3c4d9" +#define ZFS_META_GITREV "zfs-2.2.99-338-g229b9f4ed" |