author     Martin Matuska <mm@FreeBSD.org>  2021-03-21 00:46:08 +0000
committer  Martin Matuska <mm@FreeBSD.org>  2021-03-21 01:17:59 +0000
commit     f9693bef8dc83284e7ac905adc346f7d866b5245
tree       c65ebf73ca3851248d9d03a93ce731f4fc5ddecd /sys/contrib/openzfs/module
parent     815209920f1d024ca55270c106565a0b770a8c00
parent     48a1c304e82e33d5a3dd722a6ef4519dd998614b
zfs: merge OpenZFS master-891568c99
Notable upstream pull request merges:
  #11652 Split dmu_zfetch() speculation and execution parts
  #11682 Fix zfs_get_data access to files with wrong generation
  #11735 Clean up RAIDZ/DRAID ereport code
  #11737 Initialize metaslab range trees in metaslab_init
  #11739 FreeBSD: make seqc asserts conditional on replay
  #11763 Allow setting bootfs property on pools with indirect vdevs
  #11767 FreeBSD: Fix memory leaks in kstats

Obtained from:  OpenZFS
MFC after:      2 weeks
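For orientation: #11652 splits the old single predictive-prefetch entry point into a prepare/run pair, so the prediction can be recorded before the demand reads are issued and the actual prefetch started afterwards. A minimal sketch of the resulting call pattern, using the names and arguments visible in the dmu.c hunk below (the surrounding flag checks are elided):

	zstream_t *zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
	    read && DNODE_IS_CACHEABLE(dn), B_TRUE);
	/*
	 * ... issue the demand reads, setting "missed" if any dbuf
	 * was not already DB_CACHED ...
	 */
	if (zs != NULL)
		dmu_zfetch_run(zs, missed, B_TRUE);

dmu_zfetch() itself survives as a thin wrapper around this pair for callers that do not need a window between prediction and prefetch initiation.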
Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c   |  11
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c   |   6
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/abd_os.c        |  10
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/policy.c        |   2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c     |   5
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c    |   3
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c       |   4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c    |   6
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c  |   5
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c    |  51
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c      |   2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c     |  52
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c     |   4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf.c                   |   5
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu.c                    |  35
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_zfetch.c             | 250
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c               | 149
-rw-r--r--  sys/contrib/openzfs/module/zfs/refcount.c               |  10
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c                   |   4
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_draid.c             | 240
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_indirect.c          |   1
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_mirror.c            |   5
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c             | 266
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_fm.c                 |   8
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_fuid.c               |   4
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_log.c                |   5
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_vnops.c              |  14
-rw-r--r--  sys/contrib/openzfs/module/zfs/zil.c                    |   3
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c                    |   4
-rw-r--r--  sys/contrib/openzfs/module/zfs/zvol.c                   |   3
30 files changed, 433 insertions, 734 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
index 6bdef466c253..43ce358298b5 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
@@ -299,15 +299,10 @@ __kstat_create(const char *module, int instance, const char *name,
panic("Undefined kstat type %d\n", ksp->ks_type);
}
- if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL)
ksp->ks_data = NULL;
- } else {
+ else
ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
- if (ksp->ks_data == NULL) {
- kmem_free(ksp, sizeof (*ksp));
- ksp = NULL;
- }
- }
/*
* Some kstats use a module name like "zfs/poolname" to distinguish a
@@ -509,6 +504,8 @@ kstat_delete(kstat_t *ksp)
sysctl_ctx_free(&ksp->ks_sysctl_ctx);
ksp->ks_lock = NULL;
mutex_destroy(&ksp->ks_private_lock);
+ if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
+ kmem_free(ksp->ks_data, ksp->ks_data_size);
free(ksp, M_KSTAT);
}
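Two separate things happen in the spl_kstat.c hunk above (#11767): the removed NULL check was dead code, because kmem_zalloc() with KM_SLEEP blocks until the allocation succeeds rather than returning NULL, and kstat_delete() now frees ks_data for non-virtual kstats, which is the leak actually being fixed.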
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
index 647c1463ba14..94124fdcf6c3 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -407,12 +407,6 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
&metaslab_preload_limit, 0,
"Max number of metaslabs per group to preload");
-/* refcount.c */
-extern int reference_tracking_enable;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
- &reference_tracking_enable, 0,
- "Track reference holders to refcount_t objects, used mostly by ZFS");
-
/* spa.c */
extern int zfs_ccw_retry_interval;
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index d82e5f4dcf15..551a3cc8d1db 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -490,8 +490,8 @@ abd_alloc_zero_scatter(void)
#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif
-#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
-#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
+#define zfs_kmap_atomic(chunk) ((void *)chunk)
+#define zfs_kunmap_atomic(addr) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
#define local_irq_restore(flags) do { (void)(flags); } while (0)
#define nth_page(pg, i) \
@@ -879,8 +879,7 @@ abd_iter_map(struct abd_iter *aiter)
aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
- paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
- km_table[aiter->iter_km]);
+ paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg));
}
aiter->iter_mapaddr = (char *)paddr + offset;
@@ -899,8 +898,7 @@ abd_iter_unmap(struct abd_iter *aiter)
if (!abd_is_linear(aiter->iter_abd)) {
/* LINTED E_FUNC_SET_NOT_USED */
- zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
- km_table[aiter->iter_km]);
+ zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset);
}
ASSERT3P(aiter->iter_mapaddr, !=, NULL);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
index 8780d7f6c70a..bbccb2e572d9 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -124,7 +124,7 @@ secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
if (crgetfsuid(cr) == owner)
return (0);
- if (inode_owner_or_capable(ip))
+ if (zpl_inode_owner_or_capable(kcred->user_ns, ip))
return (0);
#if defined(CONFIG_USER_NS)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index ff71ef4cd065..c56fd3a6ff21 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -589,9 +589,14 @@ retry:
}
/* bio_alloc() with __GFP_WAIT never returns NULL */
+#ifdef HAVE_BIO_MAX_SEGS
+ dr->dr_bio[i] = bio_alloc(GFP_NOIO, bio_max_segs(
+ abd_nr_pages_off(zio->io_abd, bio_size, abd_offset)));
+#else
dr->dr_bio[i] = bio_alloc(GFP_NOIO,
MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
BIO_MAX_PAGES));
+#endif
if (unlikely(dr->dr_bio[i] == NULL)) {
vdev_disk_dio_free(dr);
return (SET_ERROR(ENOMEM));
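The two branches in the hunk above are intended to be equivalent: bio_max_segs(), added in Linux 5.12, clamps the segment count to the kernel's bio vector limit, which the fallback spells out with an explicit MIN(). A hypothetical local shim with the same behavior, shown only as an illustration (it is not part of the diff):

	#ifndef HAVE_BIO_MAX_SEGS
	/* Assumed equivalence with the 5.12+ kernel helper. */
	#define	bio_max_segs(nr)	MIN((nr), BIO_MAX_PAGES)
	#endif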
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index a1668e46e4f9..d33188f3822c 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -590,7 +590,8 @@ struct inode *
zfsctl_root(znode_t *zp)
{
ASSERT(zfs_has_ctldir(zp));
- igrab(ZTOZSB(zp)->z_ctldir);
+ /* Must have an existing ref, so igrab() cannot return NULL */
+ VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL);
return (ZTOZSB(zp)->z_ctldir);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
index 3b0f824115f8..3e3fda20c72c 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -136,12 +136,12 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
void *paddr;
cnt = MIN(bv->bv_len - skip, n);
- paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1);
+ paddr = zfs_kmap_atomic(bv->bv_page);
if (rw == UIO_READ)
bcopy(p, paddr + bv->bv_offset + skip, cnt);
else
bcopy(paddr + bv->bv_offset + skip, p, cnt);
- zfs_kunmap_atomic(paddr, KM_USER1);
+ zfs_kunmap_atomic(paddr);
skip += cnt;
if (skip == bv->bv_len) {
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index 3cc4b560e477..5d672af0e8aa 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -1734,7 +1734,11 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
0, kcred, NULL, NULL) == 0);
} else {
- igrab(*ipp);
+ /*
+ * Must have an existing ref, so igrab()
+ * cannot return NULL
+ */
+ VERIFY3P(igrab(*ipp), !=, NULL);
}
ZFS_EXIT(zfsvfs);
return (0);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index 84c33b541ea3..8aeed6f568cf 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -1656,7 +1656,8 @@ out:
*/
/* ARGSUSED */
int
-zfs_getattr_fast(struct inode *ip, struct kstat *sp)
+zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip,
+ struct kstat *sp)
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
@@ -1668,7 +1669,7 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp)
mutex_enter(&zp->z_lock);
- generic_fillattr(ip, sp);
+ zpl_generic_fillattr(user_ns, ip, sp);
/*
* +1 link count for root inode with visible '.zfs' directory.
*/
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index e6420f19ed87..9b526afd0002 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -101,12 +101,22 @@ zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
*/
/* ARGSUSED */
static int
+#ifdef HAVE_USERNS_IOPS_GETATTR
+zpl_root_getattr_impl(struct user_namespace *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+#else
zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
+#endif
{
struct inode *ip = path->dentry->d_inode;
+#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR)
+ generic_fillattr(user_ns, ip, stat);
+#else
generic_fillattr(ip, stat);
+#endif
stat->atime = current_time(ip);
return (0);
@@ -290,8 +300,14 @@ zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
static int
+#ifdef HAVE_IOPS_RENAME_USERNS
+zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip,
+ struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
+ unsigned int flags)
+#else
zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+#endif
{
cred_t *cr = CRED();
int error;
@@ -309,7 +325,7 @@ zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
return (error);
}
-#ifndef HAVE_RENAME_WANTS_FLAGS
+#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS)
static int
zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
struct inode *tdip, struct dentry *tdentry)
@@ -333,7 +349,12 @@ zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
}
static int
+#ifdef HAVE_IOPS_MKDIR_USERNS
+zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip,
+ struct dentry *dentry, umode_t mode)
+#else
zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
+#endif
{
cred_t *cr = CRED();
vattr_t *vap;
@@ -363,14 +384,24 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
*/
/* ARGSUSED */
static int
+#ifdef HAVE_USERNS_IOPS_GETATTR
+zpl_snapdir_getattr_impl(struct user_namespace *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+#else
zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
+#endif
{
struct inode *ip = path->dentry->d_inode;
zfsvfs_t *zfsvfs = ITOZSB(ip);
ZPL_ENTER(zfsvfs);
+#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR)
+ generic_fillattr(user_ns, ip, stat);
+#else
generic_fillattr(ip, stat);
+#endif
stat->nlink = stat->size = 2;
stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
@@ -408,7 +439,7 @@ const struct file_operations zpl_fops_snapdir = {
const struct inode_operations zpl_ops_snapdir = {
.lookup = zpl_snapdir_lookup,
.getattr = zpl_snapdir_getattr,
-#ifdef HAVE_RENAME_WANTS_FLAGS
+#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
.rename = zpl_snapdir_rename2,
#else
.rename = zpl_snapdir_rename,
@@ -495,8 +526,14 @@ zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* ARGSUSED */
static int
+#ifdef HAVE_USERNS_IOPS_GETATTR
+zpl_shares_getattr_impl(struct user_namespace *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+#else
zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
+#endif
{
struct inode *ip = path->dentry->d_inode;
zfsvfs_t *zfsvfs = ITOZSB(ip);
@@ -506,7 +543,11 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
ZPL_ENTER(zfsvfs);
if (zfsvfs->z_shares_dir == 0) {
+#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR)
+ generic_fillattr(user_ns, path->dentry->d_inode, stat);
+#else
generic_fillattr(path->dentry->d_inode, stat);
+#endif
stat->nlink = stat->size = 2;
stat->atime = current_time(ip);
ZPL_EXIT(zfsvfs);
@@ -515,7 +556,11 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
if (error == 0) {
- error = -zfs_getattr_fast(ZTOI(dzp), stat);
+#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR)
+ error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat);
+#else
+ error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat);
+#endif
iput(ZTOI(dzp));
}
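The HAVE_USERNS_IOPS_GETATTR pattern above repeats for every getattr hook in this commit: the _impl function carries both signatures, and a wrapper macro (ZPL_GETATTR_WRAPPER, referenced in zpl_inode.c below) registers whichever variant the running kernel expects. The macro's definition lives in a header outside this diff, so treat the following expansion as an assumption-labeled sketch:

	#ifdef HAVE_USERNS_IOPS_GETATTR
	#define	ZPL_GETATTR_WRAPPER(func)				\
	static int							\
	func(struct user_namespace *user_ns, const struct path *path,	\
	    struct kstat *stat, u32 request_mask,			\
	    unsigned int query_flags)					\
	{								\
		return (func##_impl(user_ns, path, stat, request_mask,	\
		    query_flags));					\
	}
	#else
	#define	ZPL_GETATTR_WRAPPER(func)				\
	static int							\
	func(const struct path *path, struct kstat *stat,		\
	    u32 request_mask, unsigned int query_flags)			\
	{								\
		return (func##_impl(path, stat, request_mask,		\
		    query_flags));					\
	}
	#endif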
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 970db4a8b73a..ea6993ffa4b0 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -869,7 +869,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
!capable(CAP_LINUX_IMMUTABLE))
return (-EACCES);
- if (!inode_owner_or_capable(ip))
+ if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
return (-EACCES);
xva_init(xva);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
index e79d334edc9b..cf0eab3e8c90 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -128,7 +128,12 @@ zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr)
}
static int
+#ifdef HAVE_IOPS_CREATE_USERNS
+zpl_create(struct user_namespace *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool flag)
+#else
zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag)
+#endif
{
cred_t *cr = CRED();
znode_t *zp;
@@ -163,7 +168,12 @@ zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag)
}
static int
+#ifdef HAVE_IOPS_MKNOD_USERNS
+zpl_mknod(struct user_namespace *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode,
+#else
zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+#endif
dev_t rdev)
{
cred_t *cr = CRED();
@@ -278,7 +288,12 @@ zpl_unlink(struct inode *dir, struct dentry *dentry)
}
static int
+#ifdef HAVE_IOPS_MKDIR_USERNS
+zpl_mkdir(struct user_namespace *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+#else
zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+#endif
{
cred_t *cr = CRED();
vattr_t *vap;
@@ -338,8 +353,14 @@ zpl_rmdir(struct inode *dir, struct dentry *dentry)
}
static int
+#ifdef HAVE_USERNS_IOPS_GETATTR
+zpl_getattr_impl(struct user_namespace *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+#else
zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
unsigned int query_flags)
+#endif
{
int error;
fstrans_cookie_t cookie;
@@ -350,7 +371,11 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
* XXX request_mask and query_flags currently ignored.
*/
- error = -zfs_getattr_fast(path->dentry->d_inode, stat);
+#ifdef HAVE_USERNS_IOPS_GETATTR
+ error = -zfs_getattr_fast(user_ns, path->dentry->d_inode, stat);
+#else
+ error = -zfs_getattr_fast(kcred->user_ns, path->dentry->d_inode, stat);
+#endif
spl_fstrans_unmark(cookie);
ASSERT3S(error, <=, 0);
@@ -359,7 +384,12 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
ZPL_GETATTR_WRAPPER(zpl_getattr);
static int
+#ifdef HAVE_SETATTR_PREPARE_USERNS
+zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry,
+ struct iattr *ia)
+#else
zpl_setattr(struct dentry *dentry, struct iattr *ia)
+#endif
{
struct inode *ip = dentry->d_inode;
cred_t *cr = CRED();
@@ -367,7 +397,7 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
int error;
fstrans_cookie_t cookie;
- error = setattr_prepare(dentry, ia);
+ error = zpl_setattr_prepare(kcred->user_ns, dentry, ia);
if (error)
return (error);
@@ -399,8 +429,14 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
}
static int
+#ifdef HAVE_IOPS_RENAME_USERNS
+zpl_rename2(struct user_namespace *user_ns, struct inode *sdip,
+ struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
+ unsigned int flags)
+#else
zpl_rename2(struct inode *sdip, struct dentry *sdentry,
struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+#endif
{
cred_t *cr = CRED();
int error;
@@ -421,7 +457,7 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry,
return (error);
}
-#ifndef HAVE_RENAME_WANTS_FLAGS
+#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS)
static int
zpl_rename(struct inode *sdip, struct dentry *sdentry,
struct inode *tdip, struct dentry *tdentry)
@@ -431,7 +467,12 @@ zpl_rename(struct inode *sdip, struct dentry *sdentry,
#endif
static int
+#ifdef HAVE_IOPS_SYMLINK_USERNS
+zpl_symlink(struct user_namespace *user_ns, struct inode *dir,
+ struct dentry *dentry, const char *name)
+#else
zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+#endif
{
cred_t *cr = CRED();
vattr_t *vap;
@@ -593,7 +634,8 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
crhold(cr);
ip->i_ctime = current_time(ip);
- igrab(ip); /* Use ihold() if available */
+ /* Must have an existing ref, so igrab() cannot return NULL */
+ VERIFY3P(igrab(ip), !=, NULL);
cookie = spl_fstrans_mark();
error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0);
@@ -677,7 +719,7 @@ const struct inode_operations zpl_dir_inode_operations = {
.mkdir = zpl_mkdir,
.rmdir = zpl_rmdir,
.mknod = zpl_mknod,
-#ifdef HAVE_RENAME_WANTS_FLAGS
+#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
.rename = zpl_rename2,
#else
.rename = zpl_rename,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index 83812f2dcba8..971cd6ad031e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -1233,7 +1233,7 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name,
if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
return (-EOPNOTSUPP);
- if (!inode_owner_or_capable(ip))
+ if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
return (-EPERM);
if (value) {
@@ -1273,7 +1273,7 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name,
if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
return (-EOPNOTSUPP);
- if (!inode_owner_or_capable(ip))
+ if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
return (-EPERM);
if (value) {
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index a6cdc017cd21..d48dc7943a24 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -1640,7 +1640,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_exit(&db->db_mtx);
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- flags & DB_RF_HAVESTRUCT);
+ B_FALSE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_hits);
@@ -1662,6 +1662,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
if (!err && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ db->db_state != DB_CACHED,
flags & DB_RF_HAVESTRUCT);
}
@@ -1691,7 +1692,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_exit(&db->db_mtx);
if (prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- flags & DB_RF_HAVESTRUCT);
+ B_TRUE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_misses);
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
index b46bf60d1a29..1c47430953b1 100644
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -497,10 +497,12 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
dmu_buf_t **dbp;
+ zstream_t *zs = NULL;
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
int err;
zio_t *zio = NULL;
+ boolean_t missed = B_FALSE;
ASSERT(length <= DMU_MAX_ACCESS);
@@ -536,9 +538,21 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
+ if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+ DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+ /*
+ * Prepare the zfetch before initiating the demand reads, so
+ * that if multiple threads block on same indirect block, we
+ * base predictions on the original less racy request order.
+ */
+ zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
+ read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+ }
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
+ if (zs)
+ dmu_zfetch_run(zs, missed, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
if (read)
@@ -546,20 +560,27 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
return (SET_ERROR(EIO));
}
- /* initiate async i/o */
- if (read)
+ /*
+ * Initiate async demand data read.
+ * We check the db_state after calling dbuf_read() because
+ * (1) dbuf_read() may change the state to CACHED due to a
+ * hit in the ARC, and (2) on a cache miss, a child will
+ * have been added to "zio" but not yet completed, so the
+ * state will not yet be CACHED.
+ */
+ if (read) {
(void) dbuf_read(db, zio, dbuf_flags);
+ if (db->db_state != DB_CACHED)
+ missed = B_TRUE;
+ }
dbp[i] = &db->db;
}
if (!read)
zfs_racct_write(length, nblks);
- if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
- DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
- dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
- read && DNODE_IS_CACHEABLE(dn), B_TRUE);
- }
+ if (zs)
+ dmu_zfetch_run(zs, missed, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);
if (read) {
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
index 5d061fe3813e..3d7407016d2c 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -59,8 +59,6 @@ typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_max_streams;
- kstat_named_t zfetchstat_max_completion_us;
- kstat_named_t zfetchstat_last_completion_us;
kstat_named_t zfetchstat_io_issued;
} zfetch_stats_t;
@@ -68,8 +66,6 @@ static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "max_streams", KSTAT_DATA_UINT64 },
- { "max_completion_us", KSTAT_DATA_UINT64 },
- { "last_completion_us", KSTAT_DATA_UINT64 },
{ "io_issued", KSTAT_DATA_UINT64 },
};
@@ -129,7 +125,7 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
- mutex_destroy(&zs->zs_lock);
+ ASSERT(!list_link_active(&zs->zs_node));
kmem_free(zs, sizeof (*zs));
}
@@ -138,17 +134,10 @@ dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
ASSERT(MUTEX_HELD(&zf->zf_lock));
list_remove(&zf->zf_stream, zs);
- dmu_zfetch_stream_fini(zs);
- zf->zf_numstreams--;
-}
-
-static void
-dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
-{
- ASSERT(MUTEX_HELD(&zf->zf_lock));
- list_remove(&zf->zf_stream, zs);
- zs->zs_fetch = NULL;
zf->zf_numstreams--;
+ membar_producer();
+ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+ dmu_zfetch_stream_fini(zs);
}
/*
@@ -161,12 +150,8 @@ dmu_zfetch_fini(zfetch_t *zf)
zstream_t *zs;
mutex_enter(&zf->zf_lock);
- while ((zs = list_head(&zf->zf_stream)) != NULL) {
- if (zfs_refcount_count(&zs->zs_blocks) != 0)
- dmu_zfetch_stream_orphan(zf, zs);
- else
- dmu_zfetch_stream_remove(zf, zs);
- }
+ while ((zs = list_head(&zf->zf_stream)) != NULL)
+ dmu_zfetch_stream_remove(zf, zs);
mutex_exit(&zf->zf_lock);
list_destroy(&zf->zf_stream);
mutex_destroy(&zf->zf_lock);
@@ -195,9 +180,9 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
zs != NULL; zs = zs_next) {
zs_next = list_next(&zf->zf_stream, zs);
/*
- * Skip gethrtime() call if there are still references
+ * Skip if still active. 1 -- zf_stream reference.
*/
- if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ if (zfs_refcount_count(&zs->zs_refs) != 1)
continue;
if (((now - zs->zs_atime) / NANOSEC) >
zfetch_min_sec_reap)
@@ -222,12 +207,17 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
zs->zs_blkid = blkid;
+ zs->zs_pf_blkid1 = blkid;
zs->zs_pf_blkid = blkid;
+ zs->zs_ipf_blkid1 = blkid;
zs->zs_ipf_blkid = blkid;
zs->zs_atime = now;
zs->zs_fetch = zf;
- zfs_refcount_create(&zs->zs_blocks);
- mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+ zs->zs_missed = B_FALSE;
+ zfs_refcount_create(&zs->zs_callers);
+ zfs_refcount_create(&zs->zs_refs);
+ /* One reference for zf_stream. */
+ zfs_refcount_add(&zs->zs_refs, NULL);
zf->zf_numstreams++;
list_insert_head(&zf->zf_stream, zs);
}
@@ -237,48 +227,36 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
{
zstream_t *zs = arg;
- if (zs->zs_start_time && io_issued) {
- hrtime_t now = gethrtime();
- hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
-
- zs->zs_start_time = 0;
- ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
- if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
- ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
- }
-
- if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
- return;
-
- /*
- * The parent fetch structure has gone away
- */
- if (zs->zs_fetch == NULL)
+ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
dmu_zfetch_stream_fini(zs);
}
/*
- * This is the predictive prefetch entry point. It associates dnode access
- * specified with blkid and nblks arguments with prefetch stream, predicts
- * further accesses based on that stats and initiates speculative prefetch.
+ * This is the predictive prefetch entry point. dmu_zfetch_prepare()
+ * associates dnode access specified with blkid and nblks arguments with
+ * prefetch stream, predicts further accesses based on that stats and returns
+ * the stream pointer on success. That pointer must later be passed to
+ * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
+ * release it. dmu_zfetch() is a wrapper for simple cases when window between
+ * prediction and prefetch initiation is not needed.
* fetch_data argument specifies whether actual data blocks should be fetched:
* FALSE -- prefetch only indirect blocks for predicted data blocks;
* TRUE -- prefetch predicted data blocks plus following indirect blocks.
*/
-void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
- boolean_t have_lock)
+zstream_t *
+dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
+ boolean_t fetch_data, boolean_t have_lock)
{
zstream_t *zs;
- int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+ int64_t pf_start, ipf_start;
int64_t pf_ahead_blks, max_blks;
- int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
- uint64_t end_of_access_blkid;
+ int max_dist_blks, pf_nblks, ipf_nblks;
+ uint64_t end_of_access_blkid, maxblkid;
end_of_access_blkid = blkid + nblks;
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
if (zfs_prefetch_disable)
- return;
+ return (NULL);
/*
* If we haven't yet loaded the indirect vdevs' mappings, we
* can only read from blocks that we carefully ensure are on
@@ -287,14 +265,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
* blocks (e.g. of the MOS's dnode object).
*/
if (!spa_indirect_vdevs_loaded(spa))
- return;
+ return (NULL);
/*
* As a fast path for small (single-block) files, ignore access
* to the first block.
*/
if (!have_lock && blkid == 0)
- return;
+ return (NULL);
if (!have_lock)
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
@@ -303,10 +281,11 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
* A fast path for small files for which no prefetch will
* happen.
*/
- if (zf->zf_dnode->dn_maxblkid < 2) {
+ maxblkid = zf->zf_dnode->dn_maxblkid;
+ if (maxblkid < 2) {
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
- return;
+ return (NULL);
}
mutex_enter(&zf->zf_lock);
@@ -317,45 +296,47 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
*/
for (zs = list_head(&zf->zf_stream); zs != NULL;
zs = list_next(&zf->zf_stream, zs)) {
- if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
- mutex_enter(&zs->zs_lock);
- /*
- * zs_blkid could have changed before we
- * acquired zs_lock; re-check them here.
- */
- if (blkid == zs->zs_blkid) {
- break;
- } else if (blkid + 1 == zs->zs_blkid) {
- blkid++;
- nblks--;
- if (nblks == 0) {
- /* Already prefetched this before. */
- mutex_exit(&zs->zs_lock);
- mutex_exit(&zf->zf_lock);
- if (!have_lock) {
- rw_exit(&zf->zf_dnode->
- dn_struct_rwlock);
- }
- return;
- }
- break;
- }
- mutex_exit(&zs->zs_lock);
+ if (blkid == zs->zs_blkid) {
+ break;
+ } else if (blkid + 1 == zs->zs_blkid) {
+ blkid++;
+ nblks--;
+ break;
}
}
+ /*
+ * If the file is ending, remove the matching stream if found.
+ * If not found then it is too late to create a new one now.
+ */
+ if (end_of_access_blkid >= maxblkid) {
+ if (zs != NULL)
+ dmu_zfetch_stream_remove(zf, zs);
+ mutex_exit(&zf->zf_lock);
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return (NULL);
+ }
+
+ /* Exit if we already prefetched this block before. */
+ if (nblks == 0) {
+ mutex_exit(&zf->zf_lock);
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return (NULL);
+ }
+
if (zs == NULL) {
/*
* This access is not part of any existing stream. Create
* a new stream for it.
*/
- ZFETCHSTAT_BUMP(zfetchstat_misses);
-
dmu_zfetch_stream_create(zf, end_of_access_blkid);
mutex_exit(&zf->zf_lock);
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
- return;
+ ZFETCHSTAT_BUMP(zfetchstat_misses);
+ return (NULL);
}
/*
@@ -369,6 +350,10 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
* start just after the block we just accessed.
*/
pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+ if (zs->zs_pf_blkid1 < end_of_access_blkid)
+ zs->zs_pf_blkid1 = end_of_access_blkid;
+ if (zs->zs_ipf_blkid1 < end_of_access_blkid)
+ zs->zs_ipf_blkid1 = end_of_access_blkid;
/*
* Double our amount of prefetched data, but don't let the
@@ -407,49 +392,108 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
* (i.e. the amount read now + the amount of data prefetched now).
*/
pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
- max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+ max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid);
ipf_nblks = MIN(pf_ahead_blks, max_blks);
zs->zs_ipf_blkid = ipf_start + ipf_nblks;
- epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
- ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
- ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
-
- zs->zs_atime = gethrtime();
- /* no prior reads in progress */
- if (zfs_refcount_count(&zs->zs_blocks) == 0)
- zs->zs_start_time = zs->zs_atime;
zs->zs_blkid = end_of_access_blkid;
- zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
- NULL);
- mutex_exit(&zs->zs_lock);
+ /* Protect the stream from reclamation. */
+ zs->zs_atime = gethrtime();
+ zfs_refcount_add(&zs->zs_refs, NULL);
+ /* Count concurrent callers. */
+ zfs_refcount_add(&zs->zs_callers, NULL);
mutex_exit(&zf->zf_lock);
- issued = 0;
+
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+
+ ZFETCHSTAT_BUMP(zfetchstat_hits);
+ return (zs);
+}
+
+void
+dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+{
+ zfetch_t *zf = zs->zs_fetch;
+ int64_t pf_start, pf_end, ipf_start, ipf_end;
+ int epbs, issued;
+
+ if (missed)
+ zs->zs_missed = missed;
/*
- * dbuf_prefetch() is asynchronous (even when it needs to read
- * indirect blocks), but we still prefer to drop our locks before
- * calling it to reduce the time we hold them.
+ * Postpone the prefetch if there are more concurrent callers.
+ * It happens when multiple requests are waiting for the same
+ * indirect block. The last one will run the prefetch for all.
*/
+ if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
+ /* Drop reference taken in dmu_zfetch_prepare(). */
+ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+ dmu_zfetch_stream_fini(zs);
+ return;
+ }
- for (int i = 0; i < pf_nblks; i++) {
- issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+ mutex_enter(&zf->zf_lock);
+ if (zs->zs_missed) {
+ pf_start = zs->zs_pf_blkid1;
+ pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
+ } else {
+ pf_start = pf_end = 0;
+ }
+ ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
+ ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
+ mutex_exit(&zf->zf_lock);
+ ASSERT3S(pf_start, <=, pf_end);
+ ASSERT3S(ipf_start, <=, ipf_end);
+
+ epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+ ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
+ ASSERT3S(ipf_start, <=, ipf_end);
+ issued = pf_end - pf_start + ipf_end - ipf_start;
+ if (issued > 1) {
+ /* More references on top of taken in dmu_zfetch_prepare(). */
+ zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL);
+ } else if (issued == 0) {
+ /* Some other thread has done our work, so drop the ref. */
+ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
+ dmu_zfetch_stream_fini(zs);
+ return;
+ }
+
+ if (!have_lock)
+ rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+ issued = 0;
+ for (int64_t blk = pf_start; blk < pf_end; blk++) {
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
dmu_zfetch_stream_done, zs);
}
- for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+ for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
dmu_zfetch_stream_done, zs);
}
+
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
- ZFETCHSTAT_BUMP(zfetchstat_hits);
if (issued)
ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+ boolean_t missed, boolean_t have_lock)
+{
+ zstream_t *zs;
+
+ zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
+ if (zs)
+ dmu_zfetch_run(zs, missed, have_lock);
+}
+
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
"Disable all ZFS prefetching");
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index bc4f007b61a1..463806c6078a 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -2316,18 +2316,13 @@ metaslab_load_impl(metaslab_t *msp)
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
- if (msp->ms_freed != NULL) {
+ if (msp->ms_new) {
/*
* If the ms_sm doesn't exist, this means that this
* metaslab hasn't gone through metaslab_sync() and
* thus has never been dirtied. So we shouldn't
* expect any unflushed allocs or frees from previous
* TXGs.
- *
- * Note: ms_freed and all the other trees except for
- * the ms_allocatable, can be NULL at this point only
- * if this is a new metaslab of a vdev that just got
- * expanded.
*/
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
@@ -2365,8 +2360,6 @@ metaslab_load_impl(metaslab_t *msp)
range_tree_walk(msp->ms_unflushed_frees,
range_tree_add, msp->ms_allocatable);
- msp->ms_loaded = B_TRUE;
-
ASSERT3P(msp->ms_group, !=, NULL);
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
if (spa_syncing_log_sm(spa) != NULL) {
@@ -2680,19 +2673,31 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
- range_seg_type_t type;
uint64_t shift, start;
- type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
+ range_seg_type_t type =
+ metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
- /*
- * We create the ms_allocatable here, but we don't create the
- * other range trees until metaslab_sync_done(). This serves
- * two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that
- * we'd data fault on any attempt to use this metaslab before
- * it's ready.
- */
ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
+ for (int t = 0; t < TXG_SIZE; t++) {
+ ms->ms_allocating[t] = range_tree_create(NULL, type,
+ NULL, start, shift);
+ }
+ ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
+ start, shift);
+ }
+ ms->ms_checkpointing =
+ range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_unflushed_allocs =
+ range_tree_create(NULL, type, NULL, start, shift);
+
+ metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+ ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
+ type, mrap, start, shift);
ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
@@ -2765,13 +2770,13 @@ metaslab_fini(metaslab_t *msp)
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
+
/*
- * If the range trees haven't been allocated, this metaslab hasn't
- * been through metaslab_sync_done() for the first time yet, so its
+ * If this metaslab hasn't been through metaslab_sync_done() yet its
* space hasn't been accounted for in its vdev and doesn't need to be
* subtracted.
*/
- if (msp->ms_freed != NULL) {
+ if (!msp->ms_new) {
metaslab_space_update(vd, mg->mg_class,
-metaslab_allocated_space(msp), 0, -msp->ms_size);
@@ -2782,27 +2787,24 @@ metaslab_fini(metaslab_t *msp)
metaslab_unload(msp);
range_tree_destroy(msp->ms_allocatable);
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
- if (msp->ms_freed != NULL) {
- range_tree_destroy(msp->ms_freeing);
- range_tree_destroy(msp->ms_freed);
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_allocs);
+ range_tree_destroy(msp->ms_checkpointing);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_frees);
- ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
- metaslab_unflushed_changes_memused(msp));
- spa->spa_unflushed_stats.sus_memused -=
- metaslab_unflushed_changes_memused(msp);
- range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
- range_tree_destroy(msp->ms_unflushed_allocs);
- range_tree_destroy(msp->ms_checkpointing);
- range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
- range_tree_destroy(msp->ms_unflushed_frees);
-
- for (int t = 0; t < TXG_SIZE; t++) {
- range_tree_destroy(msp->ms_allocating[t]);
- }
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_destroy(msp->ms_defer[t]);
- }
+ for (int t = 0; t < TXG_SIZE; t++) {
+ range_tree_destroy(msp->ms_allocating[t]);
+ }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_destroy(msp->ms_defer[t]);
}
ASSERT0(msp->ms_deferspace);
@@ -3926,17 +3928,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* This metaslab has just been added so there's no work to do now.
*/
- if (msp->ms_freeing == NULL) {
- ASSERT3P(alloctree, ==, NULL);
+ if (msp->ms_new) {
+ ASSERT0(range_tree_space(alloctree));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_freed));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+ ASSERT0(range_tree_space(msp->ms_trim));
return;
}
- ASSERT3P(alloctree, !=, NULL);
- ASSERT3P(msp->ms_freeing, !=, NULL);
- ASSERT3P(msp->ms_freed, !=, NULL);
- ASSERT3P(msp->ms_checkpointing, !=, NULL);
- ASSERT3P(msp->ms_trim, !=, NULL);
-
/*
* Normally, we don't want to process a metaslab if there are no
* allocations or frees to perform. However, if the metaslab is being
@@ -4240,54 +4240,15 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
mutex_enter(&msp->ms_lock);
- /*
- * If this metaslab is just becoming available, initialize its
- * range trees and add its capacity to the vdev.
- */
- if (msp->ms_freed == NULL) {
- range_seg_type_t type;
- uint64_t shift, start;
- type = metaslab_calculate_range_tree_type(vd, msp, &start,
- &shift);
-
- for (int t = 0; t < TXG_SIZE; t++) {
- ASSERT(msp->ms_allocating[t] == NULL);
-
- msp->ms_allocating[t] = range_tree_create(NULL, type,
- NULL, start, shift);
- }
-
- ASSERT3P(msp->ms_freeing, ==, NULL);
- msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
- shift);
-
- ASSERT3P(msp->ms_freed, ==, NULL);
- msp->ms_freed = range_tree_create(NULL, type, NULL, start,
- shift);
-
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- ASSERT3P(msp->ms_defer[t], ==, NULL);
- msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
- start, shift);
- }
-
- ASSERT3P(msp->ms_checkpointing, ==, NULL);
- msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
- start, shift);
-
- ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
- msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
- start, shift);
-
- metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
- mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
- mrap->mra_floor_shift = metaslab_by_size_min_shift;
- ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
- msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
- type, mrap, start, shift);
-
+ if (msp->ms_new) {
+ /* this is a new metaslab, add its capacity to the vdev */
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
+
+ /* there should be no allocations nor frees at this point */
+ VERIFY0(msp->ms_allocated_this_txg);
+ VERIFY0(range_tree_space(msp->ms_freed));
}
+
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_checkpointing));
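The metaslab.c hunks implement #11737: all of a metaslab's range trees are now created up front in metaslab_init() rather than lazily in metaslab_sync_done(), so the "trees not yet allocated" state disappears, and the code that used ms_freed == NULL as a proxy for a brand-new metaslab switches to the explicit ms_new flag. metaslab_fini() correspondingly destroys the trees unconditionally.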
diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c
index 39476261edfb..a3877b8d15f6 100644
--- a/sys/contrib/openzfs/module/zfs/refcount.c
+++ b/sys/contrib/openzfs/module/zfs/refcount.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2021 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -324,4 +324,12 @@ zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, ,reference_tracking_enable, INT, ZMOD_RW,
+ "Track reference holders to refcount_t objects");
+
+ZFS_MODULE_PARAM(zfs, ,reference_history, INT, ZMOD_RW,
+ "Maximum reference holders being tracked");
+/* END CSTYLED */
#endif /* ZFS_DEBUG */
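Together with the sysctl_os.c hunk earlier, this converts reference_tracking_enable from a FreeBSD-only, boot-time tunable (CTLFLAG_RDTUN) into a common, runtime-writable module parameter (ZMOD_RW), and exposes reference_history the same way. The expected user-visible names, assuming the usual ZFS_MODULE_PARAM mapping:

	/*
	 * FreeBSD:  sysctl vfs.zfs.reference_tracking_enable
	 * Linux:    /sys/module/zfs/parameters/reference_tracking_enable
	 * Both exist only in ZFS_DEBUG builds, per the surrounding #ifdef.
	 */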
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index ad4f3efb87b1..c536a1c6cda0 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -5105,10 +5105,8 @@ vdev_is_bootable(vdev_t *vd)
if (!vd->vdev_ops->vdev_op_leaf) {
const char *vdev_type = vd->vdev_ops->vdev_op_type;
- if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
- strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
+ if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0)
return (B_FALSE);
- }
}
for (int c = 0; c < vd->vdev_children; c++) {
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c
index a4f48cf744b0..fb2143e94689 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_draid.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@@ -632,236 +632,6 @@ vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
return (group * vdc->vdc_groupsz);
}
-
-static void
-vdev_draid_map_free_vsd(zio_t *zio)
-{
- raidz_map_t *rm = zio->io_vsd;
-
- ASSERT0(rm->rm_freed);
- rm->rm_freed = B_TRUE;
-
- if (rm->rm_reports == 0) {
- vdev_raidz_map_free(rm);
- }
-}
-
-/*ARGSUSED*/
-static void
-vdev_draid_cksum_free(void *arg, size_t ignored)
-{
- raidz_map_t *rm = arg;
-
- ASSERT3U(rm->rm_reports, >, 0);
-
- if (--rm->rm_reports == 0 && rm->rm_freed)
- vdev_raidz_map_free(rm);
-}
-
-static void
-vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
-{
- raidz_map_t *rm = zcr->zcr_cbdata;
- const size_t c = zcr->zcr_cbinfo;
- uint64_t skip_size = zcr->zcr_sector;
- uint64_t parity_size;
- size_t x, offset, size;
-
- if (good_data == NULL) {
- zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
- return;
- }
-
- /*
- * Detailed cksum reporting is currently only supported for single
- * row draid mappings, this covers the vast majority of zios. Only
- * a dRAID zio which spans groups will have multiple rows.
- */
- if (rm->rm_nrows != 1) {
- zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
- return;
- }
-
- raidz_row_t *rr = rm->rm_row[0];
- const abd_t *good = NULL;
- const abd_t *bad = rr->rr_col[c].rc_abd;
-
- if (c < rr->rr_firstdatacol) {
- /*
- * The first time through, calculate the parity blocks for
- * the good data (this relies on the fact that the good
- * data never changes for a given logical zio)
- */
- if (rr->rr_col[0].rc_gdata == NULL) {
- abd_t *bad_parity[VDEV_DRAID_MAXPARITY];
-
- /*
- * Set up the rr_col[]s to generate the parity for
- * good_data, first saving the parity bufs and
- * replacing them with buffers to hold the result.
- */
- for (x = 0; x < rr->rr_firstdatacol; x++) {
- bad_parity[x] = rr->rr_col[x].rc_abd;
- rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
- abd_alloc_sametype(rr->rr_col[x].rc_abd,
- rr->rr_col[x].rc_size);
- }
-
- /*
- * Fill in the data columns from good_data being
- * careful to pad short columns and empty columns
- * with a skip sector.
- */
- uint64_t good_size = abd_get_size((abd_t *)good_data);
-
- offset = 0;
- for (; x < rr->rr_cols; x++) {
- abd_free(rr->rr_col[x].rc_abd);
-
- if (offset == good_size) {
- /* empty data column (small write) */
- rr->rr_col[x].rc_abd =
- abd_get_zeros(skip_size);
- } else if (x < rr->rr_bigcols) {
- /* this is a "big column" */
- size = rr->rr_col[x].rc_size;
- rr->rr_col[x].rc_abd =
- abd_get_offset_size(
- (abd_t *)good_data, offset, size);
- offset += size;
- } else {
- /* short data column, add skip sector */
- size = rr->rr_col[x].rc_size -skip_size;
- rr->rr_col[x].rc_abd = abd_alloc(
- rr->rr_col[x].rc_size, B_TRUE);
- abd_copy_off(rr->rr_col[x].rc_abd,
- (abd_t *)good_data, 0, offset,
- size);
- abd_zero_off(rr->rr_col[x].rc_abd,
- size, skip_size);
- offset += size;
- }
- }
-
- /*
- * Construct the parity from the good data.
- */
- vdev_raidz_generate_parity_row(rm, rr);
-
- /* restore everything back to its original state */
- for (x = 0; x < rr->rr_firstdatacol; x++)
- rr->rr_col[x].rc_abd = bad_parity[x];
-
- offset = 0;
- for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
- abd_free(rr->rr_col[x].rc_abd);
- rr->rr_col[x].rc_abd = abd_get_offset_size(
- rr->rr_abd_copy, offset,
- rr->rr_col[x].rc_size);
- offset += rr->rr_col[x].rc_size;
- }
- }
-
- ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
- good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
- rr->rr_col[c].rc_size);
- } else {
- /* adjust good_data to point at the start of our column */
- parity_size = size = rr->rr_col[0].rc_size;
- if (c >= rr->rr_bigcols) {
- size -= skip_size;
- zcr->zcr_length = size;
- }
-
- /* empty column */
- if (size == 0) {
- zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE);
- return;
- }
-
- offset = 0;
- for (x = rr->rr_firstdatacol; x < c; x++) {
- if (x < rr->rr_bigcols) {
- offset += parity_size;
- } else {
- offset += parity_size - skip_size;
- }
- }
-
- good = abd_get_offset_size((abd_t *)good_data, offset, size);
- }
-
- /* we drop the ereport if it ends up that the data was good */
- zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
- abd_free((abd_t *)good);
-}
-
-/*
- * Invoked indirectly by zfs_ereport_start_checksum(), called
- * below when our read operation fails completely. The main point
- * is to keep a copy of everything we read from disk, so that at
- * vdev_draid_cksum_finish() time we can compare it with the good data.
- */
-static void
-vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
-{
- size_t c = (size_t)(uintptr_t)arg;
- raidz_map_t *rm = zio->io_vsd;
-
- /* set up the report and bump the refcount */
- zcr->zcr_cbdata = rm;
- zcr->zcr_cbinfo = c;
- zcr->zcr_finish = vdev_draid_cksum_finish;
- zcr->zcr_free = vdev_draid_cksum_free;
-
- rm->rm_reports++;
- ASSERT3U(rm->rm_reports, >, 0);
-
- if (rm->rm_row[0]->rr_abd_copy != NULL)
- return;
-
- /*
- * It's the first time we're called for this raidz_map_t, so we need
- * to copy the data aside; there's no guarantee that our zio's buffer
- * won't be re-used for something else.
- *
- * Our parity data is already in separate buffers, so there's no need
- * to copy them. Furthermore, all columns should have been expanded
- * by vdev_draid_map_alloc_empty() when attempting reconstruction.
- */
- for (int i = 0; i < rm->rm_nrows; i++) {
- raidz_row_t *rr = rm->rm_row[i];
- size_t offset = 0;
- size_t size = 0;
-
- for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
- ASSERT3U(rr->rr_col[c].rc_size, ==,
- rr->rr_col[0].rc_size);
- size += rr->rr_col[c].rc_size;
- }
-
- rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
-
- for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
- raidz_col_t *col = &rr->rr_col[c];
- abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
- offset, col->rc_size);
-
- abd_copy(tmp, col->rc_abd, col->rc_size);
- abd_free(col->rc_abd);
-
- col->rc_abd = tmp;
- offset += col->rc_size;
- }
- ASSERT3U(offset, ==, size);
- }
-}
-
-const zio_vsd_ops_t vdev_draid_vsd_ops = {
- .vsd_free = vdev_draid_map_free_vsd,
- .vsd_cksum_report = vdev_draid_cksum_report
-};
-
/*
* Full stripe writes. When writing, all columns (D+P) are required. Parity
* is calculated over all the columns, including empty zero filled sectors,
@@ -1208,7 +978,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
rr->rr_missingdata = 0;
rr->rr_missingparity = 0;
rr->rr_firstdatacol = vdc->vdc_nparity;
- rr->rr_abd_copy = NULL;
rr->rr_abd_empty = NULL;
#ifdef ZFS_DEBUG
rr->rr_offset = io_offset;
@@ -1230,7 +999,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
rc->rc_offset = physical_offset;
rc->rc_abd = NULL;
- rc->rc_gdata = NULL;
rc->rc_orig_data = NULL;
rc->rc_error = 0;
rc->rc_tried = 0;
@@ -1328,9 +1096,6 @@ vdev_draid_map_alloc(zio_t *zio)
if (nrows == 2)
rm->rm_row[1] = rr[1];
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_draid_vsd_ops;
-
return (rm);
}
@@ -2183,12 +1948,13 @@ static void
vdev_draid_io_start(zio_t *zio)
{
vdev_t *vd __maybe_unused = zio->io_vd;
- raidz_map_t *rm;
ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
- rm = vdev_draid_map_alloc(zio);
+ raidz_map_t *rm = vdev_draid_map_alloc(zio);
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
if (zio->io_type == ZIO_TYPE_WRITE) {
for (int i = 0; i < rm->rm_nrows; i++) {
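Net effect of the vdev_draid.c changes (#11735): the duplicated checksum-report machinery is deleted outright, dRAID now shares the raidz implementation through the newly exported vdev_raidz_vsd_ops, and the io_vsd/io_vsd_ops assignment moves out of vdev_draid_map_alloc() into vdev_draid_io_start(), matching the vdev_mirror.c change below. With the detailed good-data reconstruction gone, the vsd_cksum_report callback is dropped from zio_vsd_ops entirely, which is why the indirect and mirror vtables below shrink to a single vsd_free member.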
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
index 416f4c54d8e8..bafb2c767b2e 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
@@ -315,7 +315,6 @@ vdev_indirect_map_free(zio_t *zio)
static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
.vsd_free = vdev_indirect_map_free,
- .vsd_cksum_report = zio_vsd_default_cksum_report
};
/*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
index 71ca43caec1a..f360a18c0041 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
@@ -174,7 +174,6 @@ vdev_mirror_map_free(zio_t *zio)
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
.vsd_free = vdev_mirror_map_free,
- .vsd_cksum_report = zio_vsd_default_cksum_report
};
static int
@@ -379,8 +378,6 @@ vdev_mirror_map_init(zio_t *zio)
}
}
- zio->io_vsd = mm;
- zio->io_vsd_ops = &vdev_mirror_vsd_ops;
return (mm);
}
@@ -629,6 +626,8 @@ vdev_mirror_io_start(zio_t *zio)
int c, children;
mm = vdev_mirror_map_init(zio);
+ zio->io_vsd = mm;
+ zio->io_vsd_ops = &vdev_mirror_vsd_ops;
if (mm == NULL) {
ASSERT(!spa_trust_config(zio->io_spa));
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index 57a594c80ce3..db753ec16fd3 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -143,15 +143,10 @@ vdev_raidz_row_free(raidz_row_t *rr)
if (rc->rc_size != 0)
abd_free(rc->rc_abd);
- if (rc->rc_gdata != NULL)
- abd_free(rc->rc_gdata);
if (rc->rc_orig_data != NULL)
- zio_buf_free(rc->rc_orig_data, rc->rc_size);
+ abd_free(rc->rc_orig_data);
}
- if (rr->rr_abd_copy != NULL)
- abd_free(rr->rr_abd_copy);
-
if (rr->rr_abd_empty != NULL)
abd_free(rr->rr_abd_empty);
@@ -172,175 +167,11 @@ vdev_raidz_map_free_vsd(zio_t *zio)
{
raidz_map_t *rm = zio->io_vsd;
- ASSERT0(rm->rm_freed);
- rm->rm_freed = B_TRUE;
-
- if (rm->rm_reports == 0) {
- vdev_raidz_map_free(rm);
- }
-}
-
-/*ARGSUSED*/
-static void
-vdev_raidz_cksum_free(void *arg, size_t ignored)
-{
- raidz_map_t *rm = arg;
-
- ASSERT3U(rm->rm_reports, >, 0);
-
- if (--rm->rm_reports == 0 && rm->rm_freed)
- vdev_raidz_map_free(rm);
-}
-
-static void
-vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
-{
- raidz_map_t *rm = zcr->zcr_cbdata;
- const size_t c = zcr->zcr_cbinfo;
- size_t x, offset;
-
- if (good_data == NULL) {
- zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
- return;
- }
-
- ASSERT3U(rm->rm_nrows, ==, 1);
- raidz_row_t *rr = rm->rm_row[0];
-
- const abd_t *good = NULL;
- const abd_t *bad = rr->rr_col[c].rc_abd;
-
- if (c < rr->rr_firstdatacol) {
- /*
- * The first time through, calculate the parity blocks for
- * the good data (this relies on the fact that the good
- * data never changes for a given logical ZIO)
- */
- if (rr->rr_col[0].rc_gdata == NULL) {
- abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
-
- /*
- * Set up the rr_col[]s to generate the parity for
- * good_data, first saving the parity bufs and
- * replacing them with buffers to hold the result.
- */
- for (x = 0; x < rr->rr_firstdatacol; x++) {
- bad_parity[x] = rr->rr_col[x].rc_abd;
- rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
- abd_alloc_sametype(rr->rr_col[x].rc_abd,
- rr->rr_col[x].rc_size);
- }
-
- /* fill in the data columns from good_data */
- offset = 0;
- for (; x < rr->rr_cols; x++) {
- abd_free(rr->rr_col[x].rc_abd);
-
- rr->rr_col[x].rc_abd =
- abd_get_offset_size((abd_t *)good_data,
- offset, rr->rr_col[x].rc_size);
- offset += rr->rr_col[x].rc_size;
- }
-
- /*
- * Construct the parity from the good data.
- */
- vdev_raidz_generate_parity_row(rm, rr);
-
- /* restore everything back to its original state */
- for (x = 0; x < rr->rr_firstdatacol; x++)
- rr->rr_col[x].rc_abd = bad_parity[x];
-
- offset = 0;
- for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
- abd_free(rr->rr_col[x].rc_abd);
- rr->rr_col[x].rc_abd = abd_get_offset_size(
- rr->rr_abd_copy, offset,
- rr->rr_col[x].rc_size);
- offset += rr->rr_col[x].rc_size;
- }
- }
-
- ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
- good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
- rr->rr_col[c].rc_size);
- } else {
- /* adjust good_data to point at the start of our column */
- offset = 0;
- for (x = rr->rr_firstdatacol; x < c; x++)
- offset += rr->rr_col[x].rc_size;
-
- good = abd_get_offset_size((abd_t *)good_data, offset,
- rr->rr_col[c].rc_size);
- }
-
- /* we drop the ereport if it ends up that the data was good */
- zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
- abd_free((abd_t *)good);
-}
-
-/*
- * Invoked indirectly by zfs_ereport_start_checksum(), called
- * below when our read operation fails completely. The main point
- * is to keep a copy of everything we read from disk, so that at
- * vdev_raidz_cksum_finish() time we can compare it with the good data.
- */
-static void
-vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
-{
- size_t c = (size_t)(uintptr_t)arg;
- raidz_map_t *rm = zio->io_vsd;
-
- /* set up the report and bump the refcount */
- zcr->zcr_cbdata = rm;
- zcr->zcr_cbinfo = c;
- zcr->zcr_finish = vdev_raidz_cksum_finish;
- zcr->zcr_free = vdev_raidz_cksum_free;
-
- rm->rm_reports++;
- ASSERT3U(rm->rm_reports, >, 0);
- ASSERT3U(rm->rm_nrows, ==, 1);
-
- if (rm->rm_row[0]->rr_abd_copy != NULL)
- return;
-
- /*
- * It's the first time we're called for this raidz_map_t, so we need
- * to copy the data aside; there's no guarantee that our zio's buffer
- * won't be re-used for something else.
- *
- * Our parity data is already in separate buffers, so there's no need
- * to copy them.
- */
- for (int i = 0; i < rm->rm_nrows; i++) {
- raidz_row_t *rr = rm->rm_row[i];
- size_t offset = 0;
- size_t size = 0;
-
- for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
- size += rr->rr_col[c].rc_size;
-
- rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
-
- for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
- raidz_col_t *col = &rr->rr_col[c];
- abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
- offset, col->rc_size);
-
- abd_copy(tmp, col->rc_abd, col->rc_size);
-
- abd_free(col->rc_abd);
- col->rc_abd = tmp;
-
- offset += col->rc_size;
- }
- ASSERT3U(offset, ==, size);
- }
+ vdev_raidz_map_free(rm);
}
-static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+const zio_vsd_ops_t vdev_raidz_vsd_ops = {
.vsd_free = vdev_raidz_map_free_vsd,
- .vsd_cksum_report = vdev_raidz_cksum_report
};
/*
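
Note: everything deleted above existed to keep the read data alive after the zio completed (rr_abd_copy plus the rm_reports/rm_freed refcount dance) so that a later checksum ereport could compare good and bad bytes. After this merge, vdev_raidz_map_free_vsd() frees the map unconditionally, and the bytes needed for diagnosis are whatever raidz_reconstruct() already snapshotted into rc_orig_data. A compressed before/after sketch of the lifetime, with invented names:

#include <stdlib.h>

typedef struct map {
	int refs;     /* outstanding ereports (old scheme) */
	int freed;    /* map logically dead (old scheme)   */
} map_t;

/* Old scheme: last of {zio done, ereport done} frees the map. */
static void old_zio_done(map_t *m)    { m->freed = 1; if (m->refs == 0) free(m); }
static void old_report_done(map_t *m) { if (--m->refs == 0 && m->freed) free(m); }

/* New scheme: the ereport copies what it needs up front, so the
 * zio can always free the map immediately. */
static void new_zio_done(map_t *m)    { free(m); }

int main(void)
{
	map_t *a = calloc(1, sizeof (*a));
	a->refs = 1;
	old_zio_done(a);     /* deferred: a report still holds a ref */
	old_report_done(a);  /* last reference actually frees        */

	map_t *b = calloc(1, sizeof (*b));
	new_zio_done(b);     /* unconditional */
	return 0;
}
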
@@ -414,7 +245,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rr->rr_missingdata = 0;
rr->rr_missingparity = 0;
rr->rr_firstdatacol = nparity;
- rr->rr_abd_copy = NULL;
rr->rr_abd_empty = NULL;
rr->rr_nempty = 0;
#ifdef ZFS_DEBUG
@@ -435,7 +265,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
rc->rc_devidx = col;
rc->rc_offset = coff;
rc->rc_abd = NULL;
- rc->rc_gdata = NULL;
rc->rc_orig_data = NULL;
rc->rc_error = 0;
rc->rc_tried = 0;
@@ -831,7 +660,7 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
return (0);
}
-static int
+static void
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
@@ -860,11 +689,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
(void) abd_iterate_func2(dst, src, 0, 0, size,
vdev_raidz_reconst_p_func, NULL);
}
-
- return (1 << VDEV_RAIDZ_P);
}
-static int
+static void
vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
@@ -905,11 +732,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
struct reconst_q_struct rq = { abd_to_buf(src), exp };
(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
vdev_raidz_reconst_q_post_func, &rq);
-
- return (1 << VDEV_RAIDZ_Q);
}
-static int
+static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
@@ -995,8 +820,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
*/
rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
-
- return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
/* BEGIN CSTYLED */
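
Note: the reconstruct_p/q/pq routines above now return void; the bitmask values they used to return only fed the deleted ereport code. For readers new to this file, the heart of vdev_raidz_reconstruct_p() is plain XOR parity: P = D0 ^ D1 ^ ... ^ Dn-1, so any single missing column Dx equals P XORed with all surviving columns. A self-contained byte-array demonstration (the real code iterates over ABDs and adds GF(2^8) polynomial parity for Q and R):

#include <assert.h>
#include <string.h>

#define COLS 4
#define SIZE 8

int main(void)
{
	unsigned char d[COLS][SIZE] = {
		"rowdata", "morebyt", "evenmor", "lastcol"
	};
	unsigned char p[SIZE] = { 0 }, rebuilt[SIZE] = { 0 };

	/* P parity: XOR of all data columns, byte by byte. */
	for (int c = 0; c < COLS; c++)
		for (int i = 0; i < SIZE; i++)
			p[i] ^= d[c][i];

	/* "Lose" column 2, then rebuild it: D2 = P ^ D0 ^ D1 ^ D3. */
	int x = 2;
	memcpy(rebuilt, p, SIZE);
	for (int c = 0; c < COLS; c++)
		if (c != x)
			for (int i = 0; i < SIZE; i++)
				rebuilt[i] ^= d[c][i];

	assert(memcmp(rebuilt, d[x], SIZE) == 0);
	return 0;
}
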
@@ -1355,7 +1178,7 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
kmem_free(p, psize);
}
-static int
+static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
int n, i, c, t, tt;
@@ -1370,8 +1193,6 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
abd_t **bufs = NULL;
- int code = 0;
-
/*
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
* temporary linear ABDs if any non-linear ABDs are found.
@@ -1426,15 +1247,10 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
continue;
}
- code |= 1 << c;
-
parity_map[i] = c;
i++;
}
- ASSERT(code != 0);
- ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
-
psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
nmissing_rows * n + sizeof (used[0]) * n;
p = kmem_alloc(psize, KM_SLEEP);
@@ -1497,18 +1313,15 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
}
kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
}
-
- return (code);
}
-static int
+static void
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
const int *t, int nt)
{
int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
int ntgts;
int i, c, ret;
- int code;
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
@@ -1541,20 +1354,24 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
/* Reconstruct using the new math implementation */
ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
if (ret != RAIDZ_ORIGINAL_IMPL)
- return (ret);
+ return;
/*
* See if we can use any of our optimized reconstruction routines.
*/
switch (nbaddata) {
case 1:
- if (parity_valid[VDEV_RAIDZ_P])
- return (vdev_raidz_reconstruct_p(rr, dt, 1));
+ if (parity_valid[VDEV_RAIDZ_P]) {
+ vdev_raidz_reconstruct_p(rr, dt, 1);
+ return;
+ }
ASSERT(rr->rr_firstdatacol > 1);
- if (parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_q(rr, dt, 1));
+ if (parity_valid[VDEV_RAIDZ_Q]) {
+ vdev_raidz_reconstruct_q(rr, dt, 1);
+ return;
+ }
ASSERT(rr->rr_firstdatacol > 2);
break;
@@ -1563,18 +1380,17 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
ASSERT(rr->rr_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_P] &&
- parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_pq(rr, dt, 2));
+ parity_valid[VDEV_RAIDZ_Q]) {
+ vdev_raidz_reconstruct_pq(rr, dt, 2);
+ return;
+ }
ASSERT(rr->rr_firstdatacol > 2);
break;
}
- code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
- ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
- ASSERT(code > 0);
- return (code);
+ vdev_raidz_reconstruct_general(rr, tgts, ntgts);
}
static int
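
Note: with the bitmask gone, vdev_raidz_reconstruct_row() becomes a void try-specialized-then-general ladder: attempt the vectorized math implementation, then the hand-optimized P / Q / P+Q routines where the needed parity survived, and finally the general matrix path. A schematic of that control flow with invented stand-ins for the parity checks, not the actual function:

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins: each fast routine either handles the case or not. */
static bool try_fast_p(int missing)  { return (missing == 1); }
static bool try_fast_pq(int missing) { return (missing == 2); }

static void reconstruct_general(int missing)
{
	printf("general matrix path for %d missing columns\n", missing);
}

/* Mirrors the new shape of vdev_raidz_reconstruct_row(): every path
 * returns void; callers no longer interpret a result code. */
static void reconstruct_row(int missing)
{
	switch (missing) {
	case 1:
		if (try_fast_p(missing)) {
			puts("fast P path");
			return;
		}
		break;
	case 2:
		if (try_fast_pq(missing)) {
			puts("fast P+Q path");
			return;
		}
		break;
	}
	reconstruct_general(missing);
}

int main(void)
{
	reconstruct_row(1);
	reconstruct_row(3);
	return 0;
}
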
@@ -1811,10 +1627,11 @@ vdev_raidz_io_start(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
vdev_raidz_t *vdrz = vd->vdev_tsd;
- raidz_map_t *rm;
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
+ raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
vdrz->vd_logical_width, vdrz->vd_nparity);
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
/*
* Until raidz expansion is implemented all maps for a raidz vdev
@@ -1823,9 +1640,6 @@ vdev_raidz_io_start(zio_t *zio)
ASSERT3U(rm->rm_nrows, ==, 1);
raidz_row_t *rr = rm->rm_row[0];
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
-
if (zio->io_type == ZIO_TYPE_WRITE) {
vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
} else {
@@ -2021,7 +1835,7 @@ raidz_restore_orig_data(raidz_map_t *rm)
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_need_orig_restore) {
- abd_copy_from_buf(rc->rc_abd,
+ abd_copy(rc->rc_abd,
rc->rc_orig_data, rc->rc_size);
rc->rc_need_orig_restore = B_FALSE;
}
@@ -2062,9 +1876,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
if (rc->rc_devidx == ltgts[lt]) {
if (rc->rc_orig_data == NULL) {
rc->rc_orig_data =
- zio_buf_alloc(rc->rc_size);
- abd_copy_to_buf(
- rc->rc_orig_data,
+ abd_alloc_linear(
+ rc->rc_size, B_TRUE);
+ abd_copy(rc->rc_orig_data,
rc->rc_abd, rc->rc_size);
}
rc->rc_need_orig_restore = B_TRUE;
@@ -2082,10 +1896,8 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
raidz_restore_orig_data(rm);
return (EINVAL);
}
- rr->rr_code = 0;
if (dead_data > 0)
- rr->rr_code = vdev_raidz_reconstruct_row(rm, rr,
- my_tgts, t);
+ vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
}
/* Check for success */
@@ -2111,7 +1923,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
if (rc->rc_error == 0 &&
c >= rr->rr_firstdatacol) {
raidz_checksum_error(zio,
- rc, rc->rc_gdata);
+ rc, rc->rc_orig_data);
rc->rc_error =
SET_ERROR(ECKSUM);
}
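
Note: raidz_reconstruct() walks combinations of presumed-bad columns, so before overwriting a column it lazily snapshots the on-disk bytes into rc_orig_data (now an ABD, per the hunk above) and restores them when an attempt fails verification. The save/try/restore skeleton, reduced to plain buffers under invented names:

#include <stdlib.h>
#include <string.h>

typedef struct col {
	unsigned char data[16];
	unsigned char *orig;     /* lazily taken snapshot */
} col_t;

static void save_orig(col_t *c)
{
	if (c->orig == NULL) {
		c->orig = malloc(sizeof (c->data));
		memcpy(c->orig, c->data, sizeof (c->data));
	}
}

static void restore_orig(col_t *c)
{
	if (c->orig != NULL)
		memcpy(c->data, c->orig, sizeof (c->data));
}

int main(void)
{
	col_t c = { .data = "original bytes!" };

	save_orig(&c);
	memset(c.data, 0xAB, sizeof (c.data)); /* speculative rewrite */
	/* ...checksum fails for this combination... */
	restore_orig(&c);                      /* try the next one    */

	free(c.orig);
	return 0;
}
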
@@ -2318,11 +2130,7 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
}
}
-/*
- * return 0 if no reconstruction occurred, otherwise the "code" from
- * vdev_raidz_reconstruct().
- */
-static int
+static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
raidz_row_t *rr)
{
@@ -2330,7 +2138,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
- int code = 0;
ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
@@ -2385,10 +2192,8 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
ASSERT(rr->rr_firstdatacol >= n);
- code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
+ vdev_raidz_reconstruct_row(rm, rr, tgts, n);
}
-
- return (code);
}
/*
@@ -2453,7 +2258,7 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio)
(void) zfs_ereport_start_checksum(zio->io_spa,
cvd, &zio->io_bookmark, zio, rc->rc_offset,
- rc->rc_size, (void *)(uintptr_t)c, &zbc);
+ rc->rc_size, &zbc);
mutex_enter(&cvd->vdev_stat_lock);
cvd->vdev_stat.vs_checksum_errors++;
mutex_exit(&cvd->vdev_stat_lock);
@@ -2473,8 +2278,7 @@ vdev_raidz_io_done(zio_t *zio)
} else {
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
- rr->rr_code =
- vdev_raidz_io_done_reconstruct_known_missing(zio,
+ vdev_raidz_io_done_reconstruct_known_missing(zio,
rm, rr);
}
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
index 9e9f4a80ba1d..f0f953405cb2 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_fm.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -1125,8 +1125,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
*/
int
zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
- struct zio *zio, uint64_t offset, uint64_t length, void *arg,
- zio_bad_cksum_t *info)
+ struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
{
zio_cksum_report_t *report;
@@ -1144,10 +1143,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
report = kmem_zalloc(sizeof (*report), KM_SLEEP);
- if (zio->io_vsd != NULL)
- zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
- else
- zio_vsd_default_cksum_report(zio, report, arg);
+ zio_vsd_default_cksum_report(zio, report);
/* copy the checksum failure information if it was provided */
if (info != NULL) {
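
Note: with the per-vdev vsd_cksum_report hook gone, zfs_ereport_start_checksum() always snapshots the zio's own data through zio_vsd_default_cksum_report(), and the void *arg cookie disappears from its signature (the matching caller changes in zio.c appear further down). The shape of the simplification in miniature, with invented names:

#include <stdio.h>

typedef struct report { const char *how; } report_t;

static void default_report(report_t *r) { r->how = "copy of zio->io_abd"; }

/* Before: if (zio->io_vsd) zio->io_vsd_ops->vsd_cksum_report(zio, r, arg);
 *         else             zio_vsd_default_cksum_report(zio, r, arg);
 * After:  one direct call, no cookie argument to thread through. */
static void start_checksum(report_t *r)
{
	default_report(r);
}

int main(void)
{
	report_t r;
	start_checksum(&r);
	printf("bad data captured as: %s\n", r.how);
	return 0;
}
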
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
index 015dde4811e4..a90bf5feeea1 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
@@ -728,7 +728,6 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
boolean_t
zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
{
-#ifdef HAVE_KSID
uid_t gid;
#ifdef illumos
@@ -773,9 +772,6 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
*/
gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
return (groupmember(gid, cr));
-#else
- return (B_TRUE);
-#endif
}
void
diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c
index 4bb529f78838..30d5c4821ae5 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_log.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_log.c
@@ -540,6 +540,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
uint32_t blocksize = zp->z_blksz;
itx_wr_state_t write_state;
uintptr_t fsync_cnt;
+ uint64_t gen = 0;
if (zil_replaying(zilog, tx) || zp->z_unlinked ||
zfs_xattr_owner_unlinked(zp)) {
@@ -562,6 +563,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
}
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
+ sizeof (gen));
+
while (resid) {
itx_t *itx;
lr_write_t *lr;
@@ -609,6 +613,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
BP_ZERO(&lr->lr_blkptr);
itx->itx_private = ZTOZSB(zp);
+ itx->itx_gen = gen;
if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) &&
(fsync_cnt == 0))
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
index 0af03e9233b3..8229bc9a93e5 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -740,7 +740,8 @@ static void zfs_get_done(zgd_t *zgd, int error);
* Get data to generate a TX_WRITE intent log record.
*/
int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
+ struct lwb *lwb, zio_t *zio)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
@@ -751,6 +752,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
dmu_buf_t *db;
zgd_t *zgd;
int error = 0;
+ uint64_t zp_gen;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
@@ -769,6 +771,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
zfs_zrele_async(zp);
return (SET_ERROR(ENOENT));
}
+ /* check if generation number matches */
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (zp_gen)) != 0) {
+ zfs_zrele_async(zp);
+ return (SET_ERROR(EIO));
+ }
+ if (zp_gen != gen) {
+ zfs_zrele_async(zp);
+ return (SET_ERROR(ENOENT));
+ }
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 7b52f9249298..d9c3042084e3 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -1744,7 +1744,8 @@ cont:
* completed after "lwb_write_zio" completed.
*/
error = zilog->zl_get_data(itx->itx_private,
- lrwb, dbuf, lwb, lwb->lwb_write_zio);
+ itx->itx_gen, lrwb, dbuf, lwb,
+ lwb->lwb_write_zio);
if (error == EIO) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
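
Note: the zfs_log.c, zfs_vnops.c, and zil.c hunks above are one fix (#11682). zfs_log_write() records the file's generation number in the itx, zil_lwb_commit() forwards it to the zl_get_data callback, and zfs_get_data() refuses to read from an object whose current generation no longer matches, which catches the case where the object number was freed and reused before the log record was committed. A compressed sketch of the round trip, with simplified types:

#include <stdint.h>
#include <stdio.h>

typedef struct itx { uint64_t itx_gen; } itx_t;
typedef struct obj { uint64_t gen; }     obj_t;

/* Log side: remember the object's generation at write time. */
static void log_write(itx_t *itx, const obj_t *o)
{
	itx->itx_gen = o->gen;
}

/* Commit side: a reused object number has a new generation, so
 * reading its current contents would log the wrong file's data. */
static int get_data(const itx_t *itx, const obj_t *o)
{
	if (o->gen != itx->itx_gen)
		return (-1);    /* stand-in for SET_ERROR(ENOENT) */
	return (0);
}

int main(void)
{
	obj_t file = { .gen = 7 };
	itx_t itx;

	log_write(&itx, &file);
	file.gen = 8;       /* object freed and recycled meanwhile */
	printf("get_data: %d (mismatch rejected)\n", get_data(&itx, &file));
	return 0;
}
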
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 262ca24b1443..a7820e75670b 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -3950,7 +3950,7 @@ zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
/*ARGSUSED*/
void
-zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
{
void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
@@ -4288,7 +4288,7 @@ zio_checksum_verify(zio_t *zio)
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
(void) zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, &zio->io_bookmark, zio,
- zio->io_offset, zio->io_size, NULL, &info);
+ zio->io_offset, zio->io_size, &info);
mutex_enter(&zio->io_vd->vdev_stat_lock);
zio->io_vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&zio->io_vd->vdev_stat_lock);
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index 44f9832ce857..b6609363f047 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -673,7 +673,8 @@ zvol_get_done(zgd_t *zgd, int error)
* Get data to generate a TX_WRITE intent log record.
*/
int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
+ struct lwb *lwb, zio_t *zio)
{
zvol_state_t *zv = arg;
uint64_t offset = lr->lr_offset;
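
Note: zvol_get_data() gains the extra uint64_t purely to keep the zl_get_data callback signature uniform across providers; a zvol's backing object is not recycled under the log the way a file's can be, so the value is accepted and ignored (hence the neutral name arg2). A schematic of why both providers must share one function-pointer type, with invented, simplified signatures:

#include <stdint.h>
#include <stddef.h>

/* Simplified stand-in for the zl_get_data callback type. */
typedef int (*get_data_cb_t)(void *arg, uint64_t gen);

static int fs_get_data(void *arg, uint64_t gen)
{
	(void)arg;
	return (gen == 42) ? 0 : -1;   /* filesystems verify the gen */
}

static int vol_get_data(void *arg, uint64_t gen)
{
	(void)arg; (void)gen;          /* volumes ignore it */
	return (0);
}

int main(void)
{
	/* Both providers satisfy the one shared callback type. */
	get_data_cb_t cbs[] = { fs_get_data, vol_get_data };
	int rc = 0;
	for (size_t i = 0; i < 2; i++)
		rc |= cbs[i](NULL, 42);
	return (rc);
}
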