author     Martin Matuska <mm@FreeBSD.org>    2021-03-21 00:46:08 +0000
committer  Martin Matuska <mm@FreeBSD.org>    2021-03-21 01:17:59 +0000
commit     f9693bef8dc83284e7ac905adc346f7d866b5245 (patch)
tree       c65ebf73ca3851248d9d03a93ce731f4fc5ddecd /sys/contrib/openzfs/module
parent     815209920f1d024ca55270c106565a0b770a8c00 (diff)
parent     48a1c304e82e33d5a3dd722a6ef4519dd998614b (diff)
zfs: merge OpenZFS master-891568c99
Notable upstream pull request merges:
#11652 Split dmu_zfetch() speculation and execution parts (see the sketch after this list)
#11682 Fix zfs_get_data access to files with wrong generation
#11735 Clean up RAIDZ/DRAID ereport code
#11737 Initialize metaslab range trees in metaslab_init (sketched below)
#11739 FreeBSD: make seqc asserts conditional on replay
#11763 Allow setting bootfs property on pools with indirect vdevs
#11767 FreeBSD: Fix memory leaks in kstats (sketched below)
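
The centerpiece of this merge is #11652: dmu_zfetch() used to predict and issue prefetch I/O in one call made before the demand reads, which let concurrent readers racing on the same indirect block skew a stream's predictions. The merge splits it into dmu_zfetch_prepare() (match the access to a stream, predict, take references) and dmu_zfetch_run() (issue the speculative I/O, drop references), called on either side of the demand reads. Below is a minimal sketch of that two-phase shape, with hypothetical simplified names and none of the real locking or refcounting:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Hypothetical, trimmed-down stand-ins for zfetch_t / zstream_t. */
typedef struct zfetch { int zf_unused; } zfetch_t;
typedef struct zstream { uint64_t zs_pf_start, zs_pf_end; } zstream_t;

/*
 * Phase 1: match the access to a stream and extend its prediction.
 * This runs before the demand reads, so threads that will block on the
 * same indirect block are observed in their original, less racy order.
 */
static zstream_t *
prefetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
{
	(void) zf;
	zstream_t *zs = calloc(1, sizeof (*zs));    /* error handling elided */
	zs->zs_pf_start = blkid + nblks;            /* prefetch past the read */
	zs->zs_pf_end = zs->zs_pf_start + 2 * nblks; /* grow the window */
	return (zs);
}

/*
 * Phase 2: issue the speculative I/O decided in phase 1. Data blocks
 * are only worth prefetching if the demand reads actually missed.
 */
static void
prefetch_run(zstream_t *zs, bool missed)
{
	if (missed) {
		for (uint64_t b = zs->zs_pf_start; b < zs->zs_pf_end; b++)
			/* a dbuf_prefetch() of block b would be issued here */;
	}
	free(zs);
}

/* Caller pattern, mirroring dmu_buf_hold_array_by_dnode() in the diff. */
void
read_blocks(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    bool (*demand_read)(uint64_t))
{
	bool missed = false;

	zstream_t *zs = prefetch_prepare(zf, blkid, nblks); /* predict first */
	for (uint64_t i = 0; i < nblks; i++) {
		if (!demand_read(blkid + i))        /* false: not yet cached */
			missed = true;
	}
	if (zs != NULL)
		prefetch_run(zs, missed);           /* start speculation last */
}

The real code additionally counts concurrent callers (zs_callers) so that when several requests wait on the same indirect block, only the last one to finish runs the prefetch for all of them; the sketch omits that.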
Obtained from: OpenZFS
MFC after: 2 weeks
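
For #11737, the old scheme left every range tree except ms_allocatable NULL until the metaslab first passed through metaslab_sync_done(); the merge creates all trees unconditionally in metaslab_init() and replaces the "is this pointer still NULL" lifecycle tests in metaslab_sync(), metaslab_sync_done(), and metaslab_fini() with an explicit ms_new flag, as the metaslab.c hunks below show. A compressed, hypothetical sketch of the idea:

#include <stdbool.h>
#include <stdlib.h>

typedef struct range_tree { int rt_unused; } range_tree_t;

static range_tree_t *
range_tree_create(void)
{
	return (calloc(1, sizeof (range_tree_t)));  /* error handling elided */
}

/* Only the fields needed to show the lifecycle change. */
typedef struct metaslab {
	bool          ms_new;           /* not yet through sync_done */
	range_tree_t *ms_allocatable;
	range_tree_t *ms_freeing;
	range_tree_t *ms_freed;
} metaslab_t;

metaslab_t *
metaslab_init(void)
{
	metaslab_t *ms = calloc(1, sizeof (*ms));
	ms->ms_new = true;
	/* Every tree now exists from init; nothing is created lazily. */
	ms->ms_allocatable = range_tree_create();
	ms->ms_freeing = range_tree_create();
	ms->ms_freed = range_tree_create();
	return (ms);
}

void
metaslab_sync(metaslab_t *ms)
{
	if (ms->ms_new)         /* was: if (ms->ms_freeing == NULL) */
		return;         /* brand-new metaslab: nothing to flush */
	/* ... process ms_freeing / ms_freed ... */
}

With this, metaslab_sync_done() only has to clear ms_new and account the new metaslab's capacity, instead of building all the trees under the lock.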
Diffstat (limited to 'sys/contrib/openzfs/module')
30 files changed, 433 insertions, 734 deletions
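
The first hunk below (#11767) is a good warm-up for reading the rest: __kstat_create() loses a dead error path, because kmem_zalloc(..., KM_SLEEP) sleeps until memory is available and never returns NULL, and kstat_delete() gains the kmem_free() of ks_data that non-virtual kstats were missing, which was the actual leak. The ownership pairing it establishes, as a hypothetical userland sketch:

#include <stddef.h>
#include <stdlib.h>

#define KSTAT_FLAG_VIRTUAL 0x01

typedef struct kstat {
	int    ks_flags;
	size_t ks_data_size;
	void  *ks_data;         /* owned by the kstat unless VIRTUAL */
} kstat_t;

/*
 * On create: virtual kstats point at caller-owned data; everything
 * else gets a zeroed buffer the kstat now owns. In the kernel a
 * KM_SLEEP allocation cannot fail, so no error path is needed.
 */
kstat_t *
kstat_create(int flags, size_t data_size)
{
	kstat_t *ksp = calloc(1, sizeof (*ksp));    /* error handling elided */
	ksp->ks_flags = flags;
	ksp->ks_data_size = data_size;
	if (flags & KSTAT_FLAG_VIRTUAL)
		ksp->ks_data = NULL;    /* caller will supply the data */
	else
		ksp->ks_data = calloc(1, data_size);
	return (ksp);
}

/*
 * On delete: free the buffer we allocated, and only that one. This is
 * the free the FreeBSD SPL was missing, leaking ks_data on delete.
 */
void
kstat_delete(kstat_t *ksp)
{
	if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
		free(ksp->ks_data);
	free(ksp);
}

Virtual kstats (KSTAT_FLAG_VIRTUAL) never own ks_data, since the caller points it at existing storage, so they are excluded from both the allocation and the free.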
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c index 6bdef466c253..43ce358298b5 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c @@ -299,15 +299,10 @@ __kstat_create(const char *module, int instance, const char *name, panic("Undefined kstat type %d\n", ksp->ks_type); } - if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) ksp->ks_data = NULL; - } else { + else ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); - if (ksp->ks_data == NULL) { - kmem_free(ksp, sizeof (*ksp)); - ksp = NULL; - } - } /* * Some kstats use a module name like "zfs/poolname" to distinguish a @@ -509,6 +504,8 @@ kstat_delete(kstat_t *ksp) sysctl_ctx_free(&ksp->ks_sysctl_ctx); ksp->ks_lock = NULL; mutex_destroy(&ksp->ks_private_lock); + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, ksp->ks_data_size); free(ksp, M_KSTAT); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 647c1463ba14..94124fdcf6c3 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -407,12 +407,6 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, &metaslab_preload_limit, 0, "Max number of metaslabs per group to preload"); -/* refcount.c */ -extern int reference_tracking_enable; -SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, - &reference_tracking_enable, 0, - "Track reference holders to refcount_t objects, used mostly by ZFS"); - /* spa.c */ extern int zfs_ccw_retry_interval; SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index d82e5f4dcf15..551a3cc8d1db 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -490,8 +490,8 @@ abd_alloc_zero_scatter(void) #define PAGE_SHIFT (highbit64(PAGESIZE)-1) #endif -#define zfs_kmap_atomic(chunk, km) ((void *)chunk) -#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) +#define zfs_kmap_atomic(chunk) ((void *)chunk) +#define zfs_kunmap_atomic(addr) do { (void)(addr); } while (0) #define local_irq_save(flags) do { (void)(flags); } while (0) #define local_irq_restore(flags) do { (void)(flags); } while (0) #define nth_page(pg, i) \ @@ -879,8 +879,7 @@ abd_iter_map(struct abd_iter *aiter) aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, aiter->iter_abd->abd_size - aiter->iter_pos); - paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), - km_table[aiter->iter_km]); + paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg)); } aiter->iter_mapaddr = (char *)paddr + offset; @@ -899,8 +898,7 @@ abd_iter_unmap(struct abd_iter *aiter) if (!abd_is_linear(aiter->iter_abd)) { /* LINTED E_FUNC_SET_NOT_USED */ - zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, - km_table[aiter->iter_km]); + zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset); } ASSERT3P(aiter->iter_mapaddr, !=, NULL); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c index 8780d7f6c70a..bbccb2e572d9 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c @@ -124,7 +124,7 @@ secpolicy_vnode_any_access(const 
cred_t *cr, struct inode *ip, uid_t owner) if (crgetfsuid(cr) == owner) return (0); - if (inode_owner_or_capable(ip)) + if (zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (0); #if defined(CONFIG_USER_NS) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index ff71ef4cd065..c56fd3a6ff21 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -589,9 +589,14 @@ retry: } /* bio_alloc() with __GFP_WAIT never returns NULL */ +#ifdef HAVE_BIO_MAX_SEGS + dr->dr_bio[i] = bio_alloc(GFP_NOIO, bio_max_segs( + abd_nr_pages_off(zio->io_abd, bio_size, abd_offset))); +#else dr->dr_bio[i] = bio_alloc(GFP_NOIO, MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), BIO_MAX_PAGES)); +#endif if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (SET_ERROR(ENOMEM)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index a1668e46e4f9..d33188f3822c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -590,7 +590,8 @@ struct inode * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); - igrab(ZTOZSB(zp)->z_ctldir); + /* Must have an existing ref, so igrab() cannot return NULL */ + VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL); return (ZTOZSB(zp)->z_ctldir); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c index 3b0f824115f8..3e3fda20c72c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c @@ -136,12 +136,12 @@ zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) void *paddr; cnt = MIN(bv->bv_len - skip, n); - paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1); + paddr = zfs_kmap_atomic(bv->bv_page); if (rw == UIO_READ) bcopy(p, paddr + bv->bv_offset + skip, cnt); else bcopy(paddr + bv->bv_offset + skip, p, cnt); - zfs_kunmap_atomic(paddr, KM_USER1); + zfs_kunmap_atomic(paddr); skip += cnt; if (skip == bv->bv_len) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 3cc4b560e477..5d672af0e8aa 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1734,7 +1734,11 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); } else { - igrab(*ipp); + /* + * Must have an existing ref, so igrab() + * cannot return NULL + */ + VERIFY3P(igrab(*ipp), !=, NULL); } ZFS_EXIT(zfsvfs); return (0); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 84c33b541ea3..8aeed6f568cf 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -1656,7 +1656,8 @@ out: */ /* ARGSUSED */ int -zfs_getattr_fast(struct inode *ip, struct kstat *sp) +zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, + struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -1668,7 +1669,7 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp) mutex_enter(&zp->z_lock); - generic_fillattr(ip, sp); + zpl_generic_fillattr(user_ns, ip, sp); /* * +1 link count for root inode with visible '.zfs' 
directory. */ diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index e6420f19ed87..9b526afd0002 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -101,12 +101,22 @@ zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) */ /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_root_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_root_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, ip, stat); +#else generic_fillattr(ip, stat); +#endif stat->atime = current_time(ip); return (0); @@ -290,8 +300,14 @@ zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ static int +#ifdef HAVE_IOPS_RENAME_USERNS +zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) +#else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#endif { cred_t *cr = CRED(); int error; @@ -309,7 +325,7 @@ zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, return (error); } -#ifndef HAVE_RENAME_WANTS_FLAGS +#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -333,7 +349,12 @@ zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) } static int +#ifdef HAVE_IOPS_MKDIR_USERNS +zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, + struct dentry *dentry, umode_t mode) +#else zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +#endif { cred_t *cr = CRED(); vattr_t *vap; @@ -363,14 +384,24 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) */ /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_snapdir_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); ZPL_ENTER(zfsvfs); +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, ip, stat); +#else generic_fillattr(ip, stat); +#endif stat->nlink = stat->size = 2; stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); @@ -408,7 +439,7 @@ const struct file_operations zpl_fops_snapdir = { const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, -#ifdef HAVE_RENAME_WANTS_FLAGS +#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, @@ -495,8 +526,14 @@ zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_shares_getattr_impl(struct user_namespace *user_ns, + 
const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -506,7 +543,11 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, ZPL_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, path->dentry->d_inode, stat); +#else generic_fillattr(path->dentry->d_inode, stat); +#endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); ZPL_EXIT(zfsvfs); @@ -515,7 +556,11 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { - error = -zfs_getattr_fast(ZTOI(dzp), stat); +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); +#else + error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); +#endif iput(ZTOI(dzp)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 970db4a8b73a..ea6993ffa4b0 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -869,7 +869,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) !capable(CAP_LINUX_IMMUTABLE)) return (-EACCES); - if (!inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EACCES); xva_init(xva); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index e79d334edc9b..cf0eab3e8c90 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -128,7 +128,12 @@ zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr) } static int +#ifdef HAVE_IOPS_CREATE_USERNS +zpl_create(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool flag) +#else zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) +#endif { cred_t *cr = CRED(); znode_t *zp; @@ -163,7 +168,12 @@ zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) } static int +#ifdef HAVE_IOPS_MKNOD_USERNS +zpl_mknod(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, +#else zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, +#endif dev_t rdev) { cred_t *cr = CRED(); @@ -278,7 +288,12 @@ zpl_unlink(struct inode *dir, struct dentry *dentry) } static int +#ifdef HAVE_IOPS_MKDIR_USERNS +zpl_mkdir(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode) +#else zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +#endif { cred_t *cr = CRED(); vattr_t *vap; @@ -338,8 +353,14 @@ zpl_rmdir(struct inode *dir, struct dentry *dentry) } static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { int error; fstrans_cookie_t cookie; @@ -350,7 +371,11 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 
request_mask, * XXX request_mask and query_flags currently ignored. */ - error = -zfs_getattr_fast(path->dentry->d_inode, stat); +#ifdef HAVE_USERNS_IOPS_GETATTR + error = -zfs_getattr_fast(user_ns, path->dentry->d_inode, stat); +#else + error = -zfs_getattr_fast(kcred->user_ns, path->dentry->d_inode, stat); +#endif spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); @@ -359,7 +384,12 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, ZPL_GETATTR_WRAPPER(zpl_getattr); static int +#ifdef HAVE_SETATTR_PREPARE_USERNS +zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry, + struct iattr *ia) +#else zpl_setattr(struct dentry *dentry, struct iattr *ia) +#endif { struct inode *ip = dentry->d_inode; cred_t *cr = CRED(); @@ -367,7 +397,7 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) int error; fstrans_cookie_t cookie; - error = setattr_prepare(dentry, ia); + error = zpl_setattr_prepare(kcred->user_ns, dentry, ia); if (error) return (error); @@ -399,8 +429,14 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) } static int +#ifdef HAVE_IOPS_RENAME_USERNS +zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) +#else zpl_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#endif { cred_t *cr = CRED(); int error; @@ -421,7 +457,7 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry, return (error); } -#ifndef HAVE_RENAME_WANTS_FLAGS +#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -431,7 +467,12 @@ zpl_rename(struct inode *sdip, struct dentry *sdentry, #endif static int +#ifdef HAVE_IOPS_SYMLINK_USERNS +zpl_symlink(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, const char *name) +#else zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) +#endif { cred_t *cr = CRED(); vattr_t *vap; @@ -593,7 +634,8 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) crhold(cr); ip->i_ctime = current_time(ip); - igrab(ip); /* Use ihold() if available */ + /* Must have an existing ref, so igrab() cannot return NULL */ + VERIFY3P(igrab(ip), !=, NULL); cookie = spl_fstrans_mark(); error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0); @@ -677,7 +719,7 @@ const struct inode_operations zpl_dir_inode_operations = { .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, -#ifdef HAVE_RENAME_WANTS_FLAGS +#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_rename2, #else .rename = zpl_rename, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index 83812f2dcba8..971cd6ad031e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -1233,7 +1233,7 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { @@ -1273,7 +1273,7 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!inode_owner_or_capable(ip)) + if 
(!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index a6cdc017cd21..d48dc7943a24 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -1640,7 +1640,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) mutex_exit(&db->db_mtx); if (err == 0 && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - flags & DB_RF_HAVESTRUCT); + B_FALSE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); @@ -1662,6 +1662,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ if (!err && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + db->db_state != DB_CACHED, flags & DB_RF_HAVESTRUCT); } @@ -1691,7 +1692,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) mutex_exit(&db->db_mtx); if (prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, - flags & DB_RF_HAVESTRUCT); + B_TRUE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index b46bf60d1a29..1c47430953b1 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -497,10 +497,12 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; + zstream_t *zs = NULL; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio = NULL; + boolean_t missed = B_FALSE; ASSERT(length <= DMU_MAX_ACCESS); @@ -536,9 +538,21 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); + if ((flags & DMU_READ_NO_PREFETCH) == 0 && + DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { + /* + * Prepare the zfetch before initiating the demand reads, so + * that if multiple threads block on same indirect block, we + * base predictions on the original less racy request order. + */ + zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, + read && DNODE_IS_CACHEABLE(dn), B_TRUE); + } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { + if (zs) + dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) @@ -546,20 +560,27 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, return (SET_ERROR(EIO)); } - /* initiate async i/o */ - if (read) + /* + * Initiate async demand data read. + * We check the db_state after calling dbuf_read() because + * (1) dbuf_read() may change the state to CACHED due to a + * hit in the ARC, and (2) on a cache miss, a child will + * have been added to "zio" but not yet completed, so the + * state will not yet be CACHED. 
+ */ + if (read) { (void) dbuf_read(db, zio, dbuf_flags); + if (db->db_state != DB_CACHED) + missed = B_TRUE; + } dbp[i] = &db->db; } if (!read) zfs_racct_write(length, nblks); - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { - dmu_zfetch(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn), B_TRUE); - } + if (zs) + dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 5d061fe3813e..3d7407016d2c 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -59,8 +59,6 @@ typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; - kstat_named_t zfetchstat_max_completion_us; - kstat_named_t zfetchstat_last_completion_us; kstat_named_t zfetchstat_io_issued; } zfetch_stats_t; @@ -68,8 +66,6 @@ static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, - { "max_completion_us", KSTAT_DATA_UINT64 }, - { "last_completion_us", KSTAT_DATA_UINT64 }, { "io_issued", KSTAT_DATA_UINT64 }, }; @@ -129,7 +125,7 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) static void dmu_zfetch_stream_fini(zstream_t *zs) { - mutex_destroy(&zs->zs_lock); + ASSERT(!list_link_active(&zs->zs_node)); kmem_free(zs, sizeof (*zs)); } @@ -138,17 +134,10 @@ dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { ASSERT(MUTEX_HELD(&zf->zf_lock)); list_remove(&zf->zf_stream, zs); - dmu_zfetch_stream_fini(zs); - zf->zf_numstreams--; -} - -static void -dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs) -{ - ASSERT(MUTEX_HELD(&zf->zf_lock)); - list_remove(&zf->zf_stream, zs); - zs->zs_fetch = NULL; zf->zf_numstreams--; + membar_producer(); + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) + dmu_zfetch_stream_fini(zs); } /* @@ -161,12 +150,8 @@ dmu_zfetch_fini(zfetch_t *zf) zstream_t *zs; mutex_enter(&zf->zf_lock); - while ((zs = list_head(&zf->zf_stream)) != NULL) { - if (zfs_refcount_count(&zs->zs_blocks) != 0) - dmu_zfetch_stream_orphan(zf, zs); - else - dmu_zfetch_stream_remove(zf, zs); - } + while ((zs = list_head(&zf->zf_stream)) != NULL) + dmu_zfetch_stream_remove(zf, zs); mutex_exit(&zf->zf_lock); list_destroy(&zf->zf_stream); mutex_destroy(&zf->zf_lock); @@ -195,9 +180,9 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* - * Skip gethrtime() call if there are still references + * Skip if still active. 1 -- zf_stream reference. */ - if (zfs_refcount_count(&zs->zs_blocks) != 0) + if (zfs_refcount_count(&zs->zs_refs) != 1) continue; if (((now - zs->zs_atime) / NANOSEC) > zfetch_min_sec_reap) @@ -222,12 +207,17 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); zs->zs_blkid = blkid; + zs->zs_pf_blkid1 = blkid; zs->zs_pf_blkid = blkid; + zs->zs_ipf_blkid1 = blkid; zs->zs_ipf_blkid = blkid; zs->zs_atime = now; zs->zs_fetch = zf; - zfs_refcount_create(&zs->zs_blocks); - mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); + zs->zs_missed = B_FALSE; + zfs_refcount_create(&zs->zs_callers); + zfs_refcount_create(&zs->zs_refs); + /* One reference for zf_stream. 
*/ + zfs_refcount_add(&zs->zs_refs, NULL); zf->zf_numstreams++; list_insert_head(&zf->zf_stream, zs); } @@ -237,48 +227,36 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued) { zstream_t *zs = arg; - if (zs->zs_start_time && io_issued) { - hrtime_t now = gethrtime(); - hrtime_t delta = NSEC2USEC(now - zs->zs_start_time); - - zs->zs_start_time = 0; - ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta); - if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us)) - ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta); - } - - if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0) - return; - - /* - * The parent fetch structure has gone away - */ - if (zs->zs_fetch == NULL) + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); } /* - * This is the predictive prefetch entry point. It associates dnode access - * specified with blkid and nblks arguments with prefetch stream, predicts - * further accesses based on that stats and initiates speculative prefetch. + * This is the predictive prefetch entry point. dmu_zfetch_prepare() + * associates dnode access specified with blkid and nblks arguments with + * prefetch stream, predicts further accesses based on that stats and returns + * the stream pointer on success. That pointer must later be passed to + * dmu_zfetch_run() to initiate the speculative prefetch for the stream and + * release it. dmu_zfetch() is a wrapper for simple cases when window between + * prediction and prefetch initiation is not needed. * fetch_data argument specifies whether actual data blocks should be fetched: * FALSE -- prefetch only indirect blocks for predicted data blocks; * TRUE -- prefetch predicted data blocks plus following indirect blocks. */ -void -dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, - boolean_t have_lock) +zstream_t * +dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, + boolean_t fetch_data, boolean_t have_lock) { zstream_t *zs; - int64_t pf_start, ipf_start, ipf_istart, ipf_iend; + int64_t pf_start, ipf_start; int64_t pf_ahead_blks, max_blks; - int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued; - uint64_t end_of_access_blkid; + int max_dist_blks, pf_nblks, ipf_nblks; + uint64_t end_of_access_blkid, maxblkid; end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; if (zfs_prefetch_disable) - return; + return (NULL); /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on @@ -287,14 +265,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, * blocks (e.g. of the MOS's dnode object). */ if (!spa_indirect_vdevs_loaded(spa)) - return; + return (NULL); /* * As a fast path for small (single-block) files, ignore access * to the first block. */ if (!have_lock && blkid == 0) - return; + return (NULL); if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); @@ -303,10 +281,11 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, * A fast path for small files for which no prefetch will * happen. 
*/ - if (zf->zf_dnode->dn_maxblkid < 2) { + maxblkid = zf->zf_dnode->dn_maxblkid; + if (maxblkid < 2) { if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return; + return (NULL); } mutex_enter(&zf->zf_lock); @@ -317,45 +296,47 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, */ for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { - if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) { - mutex_enter(&zs->zs_lock); - /* - * zs_blkid could have changed before we - * acquired zs_lock; re-check them here. - */ - if (blkid == zs->zs_blkid) { - break; - } else if (blkid + 1 == zs->zs_blkid) { - blkid++; - nblks--; - if (nblks == 0) { - /* Already prefetched this before. */ - mutex_exit(&zs->zs_lock); - mutex_exit(&zf->zf_lock); - if (!have_lock) { - rw_exit(&zf->zf_dnode-> - dn_struct_rwlock); - } - return; - } - break; - } - mutex_exit(&zs->zs_lock); + if (blkid == zs->zs_blkid) { + break; + } else if (blkid + 1 == zs->zs_blkid) { + blkid++; + nblks--; + break; } } + /* + * If the file is ending, remove the matching stream if found. + * If not found then it is too late to create a new one now. + */ + if (end_of_access_blkid >= maxblkid) { + if (zs != NULL) + dmu_zfetch_stream_remove(zf, zs); + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return (NULL); + } + + /* Exit if we already prefetched this block before. */ + if (nblks == 0) { + mutex_exit(&zf->zf_lock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return (NULL); + } + if (zs == NULL) { /* * This access is not part of any existing stream. Create * a new stream for it. */ - ZFETCHSTAT_BUMP(zfetchstat_misses); - dmu_zfetch_stream_create(zf, end_of_access_blkid); mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - return; + ZFETCHSTAT_BUMP(zfetchstat_misses); + return (NULL); } /* @@ -369,6 +350,10 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, * start just after the block we just accessed. */ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); + if (zs->zs_pf_blkid1 < end_of_access_blkid) + zs->zs_pf_blkid1 = end_of_access_blkid; + if (zs->zs_ipf_blkid1 < end_of_access_blkid) + zs->zs_ipf_blkid1 = end_of_access_blkid; /* * Double our amount of prefetched data, but don't let the @@ -407,49 +392,108 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, * (i.e. the amount read now + the amount of data prefetched now). */ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; - max_blks = max_dist_blks - (ipf_start - end_of_access_blkid); + max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid); ipf_nblks = MIN(pf_ahead_blks, max_blks); zs->zs_ipf_blkid = ipf_start + ipf_nblks; - epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; - ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; - ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; - - zs->zs_atime = gethrtime(); - /* no prior reads in progress */ - if (zfs_refcount_count(&zs->zs_blocks) == 0) - zs->zs_start_time = zs->zs_atime; zs->zs_blkid = end_of_access_blkid; - zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart, - NULL); - mutex_exit(&zs->zs_lock); + /* Protect the stream from reclamation. */ + zs->zs_atime = gethrtime(); + zfs_refcount_add(&zs->zs_refs, NULL); + /* Count concurrent callers. 
*/ + zfs_refcount_add(&zs->zs_callers, NULL); mutex_exit(&zf->zf_lock); - issued = 0; + + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + + ZFETCHSTAT_BUMP(zfetchstat_hits); + return (zs); +} + +void +dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) +{ + zfetch_t *zf = zs->zs_fetch; + int64_t pf_start, pf_end, ipf_start, ipf_end; + int epbs, issued; + + if (missed) + zs->zs_missed = missed; /* - * dbuf_prefetch() is asynchronous (even when it needs to read - * indirect blocks), but we still prefer to drop our locks before - * calling it to reduce the time we hold them. + * Postpone the prefetch if there are more concurrent callers. + * It happens when multiple requests are waiting for the same + * indirect block. The last one will run the prefetch for all. */ + if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { + /* Drop reference taken in dmu_zfetch_prepare(). */ + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) + dmu_zfetch_stream_fini(zs); + return; + } - for (int i = 0; i < pf_nblks; i++) { - issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i, + mutex_enter(&zf->zf_lock); + if (zs->zs_missed) { + pf_start = zs->zs_pf_blkid1; + pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid; + } else { + pf_start = pf_end = 0; + } + ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1); + ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid; + mutex_exit(&zf->zf_lock); + ASSERT3S(pf_start, <=, pf_end); + ASSERT3S(ipf_start, <=, ipf_end); + + epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; + ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; + ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; + ASSERT3S(ipf_start, <=, ipf_end); + issued = pf_end - pf_start + ipf_end - ipf_start; + if (issued > 1) { + /* More references on top of taken in dmu_zfetch_prepare(). */ + zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL); + } else if (issued == 0) { + /* Some other thread has done our work, so drop the ref. 
*/ + if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) + dmu_zfetch_stream_fini(zs); + return; + } + + if (!have_lock) + rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); + + issued = 0; + for (int64_t blk = pf_start; blk < pf_end; blk++) { + issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, dmu_zfetch_stream_done, zs); } - for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { + for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, dmu_zfetch_stream_done, zs); } + if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); - ZFETCHSTAT_BUMP(zfetchstat_hits); if (issued) ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); } +void +dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, + boolean_t missed, boolean_t have_lock) +{ + zstream_t *zs; + + zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); + if (zs) + dmu_zfetch_run(zs, missed, have_lock); +} + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, "Disable all ZFS prefetching"); diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index bc4f007b61a1..463806c6078a 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -2316,18 +2316,13 @@ metaslab_load_impl(metaslab_t *msp) range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); - if (msp->ms_freed != NULL) { + if (msp->ms_new) { /* * If the ms_sm doesn't exist, this means that this * metaslab hasn't gone through metaslab_sync() and * thus has never been dirtied. So we shouldn't * expect any unflushed allocs or frees from previous * TXGs. - * - * Note: ms_freed and all the other trees except for - * the ms_allocatable, can be NULL at this point only - * if this is a new metaslab of a vdev that just got - * expanded. */ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); @@ -2365,8 +2360,6 @@ metaslab_load_impl(metaslab_t *msp) range_tree_walk(msp->ms_unflushed_frees, range_tree_add, msp->ms_allocatable); - msp->ms_loaded = B_TRUE; - ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (spa_syncing_log_sm(spa) != NULL) { @@ -2680,19 +2673,31 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } - range_seg_type_t type; uint64_t shift, start; - type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, ms, &start, &shift); - /* - * We create the ms_allocatable here, but we don't create the - * other range trees until metaslab_sync_done(). This serves - * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that - * we'd data fault on any attempt to use this metaslab before - * it's ready. 
- */ ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); + for (int t = 0; t < TXG_SIZE; t++) { + ms->ms_allocating[t] = range_tree_create(NULL, type, + NULL, start, shift); + } + ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); + ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + ms->ms_defer[t] = range_tree_create(NULL, type, NULL, + start, shift); + } + ms->ms_checkpointing = + range_tree_create(NULL, type, NULL, start, shift); + ms->ms_unflushed_allocs = + range_tree_create(NULL, type, NULL, start, shift); + + metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + mrap->mra_bt = &ms->ms_unflushed_frees_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + type, mrap, start, shift); ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); @@ -2765,13 +2770,13 @@ metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); + /* - * If the range trees haven't been allocated, this metaslab hasn't - * been through metaslab_sync_done() for the first time yet, so its + * If this metaslab hasn't been through metaslab_sync_done() yet its * space hasn't been accounted for in its vdev and doesn't need to be * subtracted. */ - if (msp->ms_freed != NULL) { + if (!msp->ms_new) { metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); @@ -2782,27 +2787,24 @@ metaslab_fini(metaslab_t *msp) metaslab_unload(msp); range_tree_destroy(msp->ms_allocatable); + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); - if (msp->ms_freed != NULL) { - range_tree_destroy(msp->ms_freeing); - range_tree_destroy(msp->ms_freed); + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_destroy(msp->ms_checkpointing); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); - ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, - metaslab_unflushed_changes_memused(msp)); - spa->spa_unflushed_stats.sus_memused -= - metaslab_unflushed_changes_memused(msp); - range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_allocs); - range_tree_destroy(msp->ms_checkpointing); - range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_frees); - - for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_allocating[t]); - } - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defer[t]); - } + for (int t = 0; t < TXG_SIZE; t++) { + range_tree_destroy(msp->ms_allocating[t]); + } + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); @@ -3926,17 +3928,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) /* * This metaslab has just been added so there's no work to do now. 
*/ - if (msp->ms_freeing == NULL) { - ASSERT3P(alloctree, ==, NULL); + if (msp->ms_new) { + ASSERT0(range_tree_space(alloctree)); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + ASSERT0(range_tree_space(msp->ms_trim)); return; } - ASSERT3P(alloctree, !=, NULL); - ASSERT3P(msp->ms_freeing, !=, NULL); - ASSERT3P(msp->ms_freed, !=, NULL); - ASSERT3P(msp->ms_checkpointing, !=, NULL); - ASSERT3P(msp->ms_trim, !=, NULL); - /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being @@ -4240,54 +4240,15 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); - /* - * If this metaslab is just becoming available, initialize its - * range trees and add its capacity to the vdev. - */ - if (msp->ms_freed == NULL) { - range_seg_type_t type; - uint64_t shift, start; - type = metaslab_calculate_range_tree_type(vd, msp, &start, - &shift); - - for (int t = 0; t < TXG_SIZE; t++) { - ASSERT(msp->ms_allocating[t] == NULL); - - msp->ms_allocating[t] = range_tree_create(NULL, type, - NULL, start, shift); - } - - ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, type, NULL, start, - shift); - - ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, type, NULL, start, - shift); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ASSERT3P(msp->ms_defer[t], ==, NULL); - msp->ms_defer[t] = range_tree_create(NULL, type, NULL, - start, shift); - } - - ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, type, NULL, - start, shift); - - ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); - msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, - start, shift); - - metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); - mrap->mra_bt = &msp->ms_unflushed_frees_by_size; - mrap->mra_floor_shift = metaslab_by_size_min_shift; - ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, - type, mrap, start, shift); - + if (msp->ms_new) { + /* this is a new metaslab, add its capacity to the vdev */ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); + + /* there should be no allocations nor frees at this point */ + VERIFY0(msp->ms_allocated_this_txg); + VERIFY0(range_tree_space(msp->ms_freed)); } + ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c index 39476261edfb..a3877b8d15f6 100644 --- a/sys/contrib/openzfs/module/zfs/refcount.c +++ b/sys/contrib/openzfs/module/zfs/refcount.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2021 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> @@ -324,4 +324,12 @@ zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) mutex_exit(&rc->rc_mtx); return (B_TRUE); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, ,reference_tracking_enable, INT, ZMOD_RW, + "Track reference holders to refcount_t objects"); + +ZFS_MODULE_PARAM(zfs, ,reference_history, INT, ZMOD_RW, + "Maximum reference holders being tracked"); +/* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index ad4f3efb87b1..c536a1c6cda0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -5105,10 +5105,8 @@ vdev_is_bootable(vdev_t *vd) if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || - strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { + if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); - } } for (int c = 0; c < vd->vdev_children; c++) { diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index a4f48cf744b0..fb2143e94689 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -632,236 +632,6 @@ vdev_draid_group_to_offset(vdev_t *vd, uint64_t group) return (group * vdc->vdc_groupsz); } - -static void -vdev_draid_map_free_vsd(zio_t *zio) -{ - raidz_map_t *rm = zio->io_vsd; - - ASSERT0(rm->rm_freed); - rm->rm_freed = B_TRUE; - - if (rm->rm_reports == 0) { - vdev_raidz_map_free(rm); - } -} - -/*ARGSUSED*/ -static void -vdev_draid_cksum_free(void *arg, size_t ignored) -{ - raidz_map_t *rm = arg; - - ASSERT3U(rm->rm_reports, >, 0); - - if (--rm->rm_reports == 0 && rm->rm_freed) - vdev_raidz_map_free(rm); -} - -static void -vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) -{ - raidz_map_t *rm = zcr->zcr_cbdata; - const size_t c = zcr->zcr_cbinfo; - uint64_t skip_size = zcr->zcr_sector; - uint64_t parity_size; - size_t x, offset, size; - - if (good_data == NULL) { - zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); - return; - } - - /* - * Detailed cksum reporting is currently only supported for single - * row draid mappings, this covers the vast majority of zios. Only - * a dRAID zio which spans groups will have multiple rows. - */ - if (rm->rm_nrows != 1) { - zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); - return; - } - - raidz_row_t *rr = rm->rm_row[0]; - const abd_t *good = NULL; - const abd_t *bad = rr->rr_col[c].rc_abd; - - if (c < rr->rr_firstdatacol) { - /* - * The first time through, calculate the parity blocks for - * the good data (this relies on the fact that the good - * data never changes for a given logical zio) - */ - if (rr->rr_col[0].rc_gdata == NULL) { - abd_t *bad_parity[VDEV_DRAID_MAXPARITY]; - - /* - * Set up the rr_col[]s to generate the parity for - * good_data, first saving the parity bufs and - * replacing them with buffers to hold the result. - */ - for (x = 0; x < rr->rr_firstdatacol; x++) { - bad_parity[x] = rr->rr_col[x].rc_abd; - rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = - abd_alloc_sametype(rr->rr_col[x].rc_abd, - rr->rr_col[x].rc_size); - } - - /* - * Fill in the data columns from good_data being - * careful to pad short columns and empty columns - * with a skip sector. 
- */ - uint64_t good_size = abd_get_size((abd_t *)good_data); - - offset = 0; - for (; x < rr->rr_cols; x++) { - abd_free(rr->rr_col[x].rc_abd); - - if (offset == good_size) { - /* empty data column (small write) */ - rr->rr_col[x].rc_abd = - abd_get_zeros(skip_size); - } else if (x < rr->rr_bigcols) { - /* this is a "big column" */ - size = rr->rr_col[x].rc_size; - rr->rr_col[x].rc_abd = - abd_get_offset_size( - (abd_t *)good_data, offset, size); - offset += size; - } else { - /* short data column, add skip sector */ - size = rr->rr_col[x].rc_size -skip_size; - rr->rr_col[x].rc_abd = abd_alloc( - rr->rr_col[x].rc_size, B_TRUE); - abd_copy_off(rr->rr_col[x].rc_abd, - (abd_t *)good_data, 0, offset, - size); - abd_zero_off(rr->rr_col[x].rc_abd, - size, skip_size); - offset += size; - } - } - - /* - * Construct the parity from the good data. - */ - vdev_raidz_generate_parity_row(rm, rr); - - /* restore everything back to its original state */ - for (x = 0; x < rr->rr_firstdatacol; x++) - rr->rr_col[x].rc_abd = bad_parity[x]; - - offset = 0; - for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { - abd_free(rr->rr_col[x].rc_abd); - rr->rr_col[x].rc_abd = abd_get_offset_size( - rr->rr_abd_copy, offset, - rr->rr_col[x].rc_size); - offset += rr->rr_col[x].rc_size; - } - } - - ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); - good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, - rr->rr_col[c].rc_size); - } else { - /* adjust good_data to point at the start of our column */ - parity_size = size = rr->rr_col[0].rc_size; - if (c >= rr->rr_bigcols) { - size -= skip_size; - zcr->zcr_length = size; - } - - /* empty column */ - if (size == 0) { - zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE); - return; - } - - offset = 0; - for (x = rr->rr_firstdatacol; x < c; x++) { - if (x < rr->rr_bigcols) { - offset += parity_size; - } else { - offset += parity_size - skip_size; - } - } - - good = abd_get_offset_size((abd_t *)good_data, offset, size); - } - - /* we drop the ereport if it ends up that the data was good */ - zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_free((abd_t *)good); -} - -/* - * Invoked indirectly by zfs_ereport_start_checksum(), called - * below when our read operation fails completely. The main point - * is to keep a copy of everything we read from disk, so that at - * vdev_draid_cksum_finish() time we can compare it with the good data. - */ -static void -vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) -{ - size_t c = (size_t)(uintptr_t)arg; - raidz_map_t *rm = zio->io_vsd; - - /* set up the report and bump the refcount */ - zcr->zcr_cbdata = rm; - zcr->zcr_cbinfo = c; - zcr->zcr_finish = vdev_draid_cksum_finish; - zcr->zcr_free = vdev_draid_cksum_free; - - rm->rm_reports++; - ASSERT3U(rm->rm_reports, >, 0); - - if (rm->rm_row[0]->rr_abd_copy != NULL) - return; - - /* - * It's the first time we're called for this raidz_map_t, so we need - * to copy the data aside; there's no guarantee that our zio's buffer - * won't be re-used for something else. - * - * Our parity data is already in separate buffers, so there's no need - * to copy them. Furthermore, all columns should have been expanded - * by vdev_draid_map_alloc_empty() when attempting reconstruction. 
- */ - for (int i = 0; i < rm->rm_nrows; i++) { - raidz_row_t *rr = rm->rm_row[i]; - size_t offset = 0; - size_t size = 0; - - for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { - ASSERT3U(rr->rr_col[c].rc_size, ==, - rr->rr_col[0].rc_size); - size += rr->rr_col[c].rc_size; - } - - rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); - - for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { - raidz_col_t *col = &rr->rr_col[c]; - abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, - offset, col->rc_size); - - abd_copy(tmp, col->rc_abd, col->rc_size); - abd_free(col->rc_abd); - - col->rc_abd = tmp; - offset += col->rc_size; - } - ASSERT3U(offset, ==, size); - } -} - -const zio_vsd_ops_t vdev_draid_vsd_ops = { - .vsd_free = vdev_draid_map_free_vsd, - .vsd_cksum_report = vdev_draid_cksum_report -}; - /* * Full stripe writes. When writing, all columns (D+P) are required. Parity * is calculated over all the columns, including empty zero filled sectors, @@ -1208,7 +978,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rr->rr_missingdata = 0; rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_copy = NULL; rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; @@ -1230,7 +999,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; rc->rc_abd = NULL; - rc->rc_gdata = NULL; rc->rc_orig_data = NULL; rc->rc_error = 0; rc->rc_tried = 0; @@ -1328,9 +1096,6 @@ vdev_draid_map_alloc(zio_t *zio) if (nrows == 2) rm->rm_row[1] = rr[1]; - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_draid_vsd_ops; - return (rm); } @@ -2183,12 +1948,13 @@ static void vdev_draid_io_start(zio_t *zio) { vdev_t *vd __maybe_unused = zio->io_vd; - raidz_map_t *rm; ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset)); - rm = vdev_draid_map_alloc(zio); + raidz_map_t *rm = vdev_draid_map_alloc(zio); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 416f4c54d8e8..bafb2c767b2e 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -315,7 +315,6 @@ vdev_indirect_map_free(zio_t *zio) static const zio_vsd_ops_t vdev_indirect_vsd_ops = { .vsd_free = vdev_indirect_map_free, - .vsd_cksum_report = zio_vsd_default_cksum_report }; /* diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c index 71ca43caec1a..f360a18c0041 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c +++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c @@ -174,7 +174,6 @@ vdev_mirror_map_free(zio_t *zio) static const zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, - .vsd_cksum_report = zio_vsd_default_cksum_report }; static int @@ -379,8 +378,6 @@ vdev_mirror_map_init(zio_t *zio) } } - zio->io_vsd = mm; - zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); } @@ -629,6 +626,8 @@ vdev_mirror_io_start(zio_t *zio) int c, children; mm = vdev_mirror_map_init(zio); + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 
57a594c80ce3..db753ec16fd3 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -143,15 +143,10 @@ vdev_raidz_row_free(raidz_row_t *rr) if (rc->rc_size != 0) abd_free(rc->rc_abd); - if (rc->rc_gdata != NULL) - abd_free(rc->rc_gdata); if (rc->rc_orig_data != NULL) - zio_buf_free(rc->rc_orig_data, rc->rc_size); + abd_free(rc->rc_orig_data); } - if (rr->rr_abd_copy != NULL) - abd_free(rr->rr_abd_copy); - if (rr->rr_abd_empty != NULL) abd_free(rr->rr_abd_empty); @@ -172,175 +167,11 @@ vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; - ASSERT0(rm->rm_freed); - rm->rm_freed = B_TRUE; - - if (rm->rm_reports == 0) { - vdev_raidz_map_free(rm); - } -} - -/*ARGSUSED*/ -static void -vdev_raidz_cksum_free(void *arg, size_t ignored) -{ - raidz_map_t *rm = arg; - - ASSERT3U(rm->rm_reports, >, 0); - - if (--rm->rm_reports == 0 && rm->rm_freed) - vdev_raidz_map_free(rm); -} - -static void -vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) -{ - raidz_map_t *rm = zcr->zcr_cbdata; - const size_t c = zcr->zcr_cbinfo; - size_t x, offset; - - if (good_data == NULL) { - zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); - return; - } - - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; - - const abd_t *good = NULL; - const abd_t *bad = rr->rr_col[c].rc_abd; - - if (c < rr->rr_firstdatacol) { - /* - * The first time through, calculate the parity blocks for - * the good data (this relies on the fact that the good - * data never changes for a given logical ZIO) - */ - if (rr->rr_col[0].rc_gdata == NULL) { - abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; - - /* - * Set up the rr_col[]s to generate the parity for - * good_data, first saving the parity bufs and - * replacing them with buffers to hold the result. - */ - for (x = 0; x < rr->rr_firstdatacol; x++) { - bad_parity[x] = rr->rr_col[x].rc_abd; - rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = - abd_alloc_sametype(rr->rr_col[x].rc_abd, - rr->rr_col[x].rc_size); - } - - /* fill in the data columns from good_data */ - offset = 0; - for (; x < rr->rr_cols; x++) { - abd_free(rr->rr_col[x].rc_abd); - - rr->rr_col[x].rc_abd = - abd_get_offset_size((abd_t *)good_data, - offset, rr->rr_col[x].rc_size); - offset += rr->rr_col[x].rc_size; - } - - /* - * Construct the parity from the good data. - */ - vdev_raidz_generate_parity_row(rm, rr); - - /* restore everything back to its original state */ - for (x = 0; x < rr->rr_firstdatacol; x++) - rr->rr_col[x].rc_abd = bad_parity[x]; - - offset = 0; - for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { - abd_free(rr->rr_col[x].rc_abd); - rr->rr_col[x].rc_abd = abd_get_offset_size( - rr->rr_abd_copy, offset, - rr->rr_col[x].rc_size); - offset += rr->rr_col[x].rc_size; - } - } - - ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); - good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, - rr->rr_col[c].rc_size); - } else { - /* adjust good_data to point at the start of our column */ - offset = 0; - for (x = rr->rr_firstdatacol; x < c; x++) - offset += rr->rr_col[x].rc_size; - - good = abd_get_offset_size((abd_t *)good_data, offset, - rr->rr_col[c].rc_size); - } - - /* we drop the ereport if it ends up that the data was good */ - zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_free((abd_t *)good); -} - -/* - * Invoked indirectly by zfs_ereport_start_checksum(), called - * below when our read operation fails completely. 
The main point - * is to keep a copy of everything we read from disk, so that at - * vdev_raidz_cksum_finish() time we can compare it with the good data. - */ -static void -vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) -{ - size_t c = (size_t)(uintptr_t)arg; - raidz_map_t *rm = zio->io_vsd; - - /* set up the report and bump the refcount */ - zcr->zcr_cbdata = rm; - zcr->zcr_cbinfo = c; - zcr->zcr_finish = vdev_raidz_cksum_finish; - zcr->zcr_free = vdev_raidz_cksum_free; - - rm->rm_reports++; - ASSERT3U(rm->rm_reports, >, 0); - ASSERT3U(rm->rm_nrows, ==, 1); - - if (rm->rm_row[0]->rr_abd_copy != NULL) - return; - - /* - * It's the first time we're called for this raidz_map_t, so we need - * to copy the data aside; there's no guarantee that our zio's buffer - * won't be re-used for something else. - * - * Our parity data is already in separate buffers, so there's no need - * to copy them. - */ - for (int i = 0; i < rm->rm_nrows; i++) { - raidz_row_t *rr = rm->rm_row[i]; - size_t offset = 0; - size_t size = 0; - - for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) - size += rr->rr_col[c].rc_size; - - rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); - - for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { - raidz_col_t *col = &rr->rr_col[c]; - abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, - offset, col->rc_size); - - abd_copy(tmp, col->rc_abd, col->rc_size); - - abd_free(col->rc_abd); - col->rc_abd = tmp; - - offset += col->rc_size; - } - ASSERT3U(offset, ==, size); - } + vdev_raidz_map_free(rm); } -static const zio_vsd_ops_t vdev_raidz_vsd_ops = { +const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, - .vsd_cksum_report = vdev_raidz_cksum_report }; /* @@ -414,7 +245,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rr->rr_missingdata = 0; rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_copy = NULL; rr->rr_abd_empty = NULL; rr->rr_nempty = 0; #ifdef ZFS_DEBUG @@ -435,7 +265,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rc->rc_devidx = col; rc->rc_offset = coff; rc->rc_abd = NULL; - rc->rc_gdata = NULL; rc->rc_orig_data = NULL; rc->rc_error = 0; rc->rc_tried = 0; @@ -831,7 +660,7 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) return (0); } -static int +static void vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; @@ -860,11 +689,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_p_func, NULL); } - - return (1 << VDEV_RAIDZ_P); } -static int +static void vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; @@ -905,11 +732,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) struct reconst_q_struct rq = { abd_to_buf(src), exp }; (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); - - return (1 << VDEV_RAIDZ_Q); } -static int +static void vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; @@ -995,8 +820,6 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) */ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; - - return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } /* BEGIN CSTYLED */ @@ -1355,7 +1178,7 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, kmem_free(p, psize); } -static int +static void 
@@ -1370,8 +1193,6 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
 	abd_t **bufs = NULL;
 
-	int code = 0;
-
 	/*
 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
 	 * temporary linear ABDs if any non-linear ABDs are found.
 	 */
@@ -1426,15 +1247,10 @@
 			continue;
 		}
 
-		code |= 1 << c;
-
 		parity_map[i] = c;
 		i++;
 	}
 
-	ASSERT(code != 0);
-	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
-
 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 	    nmissing_rows * n + sizeof (used[0]) * n;
 	p = kmem_alloc(psize, KM_SLEEP);
@@ -1497,18 +1313,15 @@
 		}
 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
 	}
-
-	return (code);
 }
 
-static int
+static void
 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
     const int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int ntgts;
 	int i, c, ret;
-	int code;
 	int nbadparity, nbaddata;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
@@ -1541,20 +1354,24 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
 	/* Reconstruct using the new math implementation */
 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
 	if (ret != RAIDZ_ORIGINAL_IMPL)
-		return (ret);
+		return;
 
 	/*
 	 * See if we can use any of our optimized reconstruction routines.
 	 */
 	switch (nbaddata) {
 	case 1:
-		if (parity_valid[VDEV_RAIDZ_P])
-			return (vdev_raidz_reconstruct_p(rr, dt, 1));
+		if (parity_valid[VDEV_RAIDZ_P]) {
+			vdev_raidz_reconstruct_p(rr, dt, 1);
+			return;
+		}
 
 		ASSERT(rr->rr_firstdatacol > 1);
 
-		if (parity_valid[VDEV_RAIDZ_Q])
-			return (vdev_raidz_reconstruct_q(rr, dt, 1));
+		if (parity_valid[VDEV_RAIDZ_Q]) {
+			vdev_raidz_reconstruct_q(rr, dt, 1);
+			return;
+		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 		break;
@@ -1563,18 +1380,17 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_P] &&
-		    parity_valid[VDEV_RAIDZ_Q])
-			return (vdev_raidz_reconstruct_pq(rr, dt, 2));
+		    parity_valid[VDEV_RAIDZ_Q]) {
+			vdev_raidz_reconstruct_pq(rr, dt, 2);
+			return;
+		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 		break;
 	}
 
-	code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
-	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
-	ASSERT(code > 0);
-	return (code);
+	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
 }
 
 static int
@@ -1811,10 +1627,11 @@ vdev_raidz_io_start(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
-	raidz_map_t *rm;
 
-	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
+	raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
 	    vdrz->vd_logical_width, vdrz->vd_nparity);
+	zio->io_vsd = rm;
+	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 
 	/*
 	 * Until raidz expansion is implemented all maps for a raidz vdev
@@ -1823,9 +1640,6 @@ vdev_raidz_io_start(zio_t *zio)
 	ASSERT3U(rm->rm_nrows, ==, 1);
 	raidz_row_t *rr = rm->rm_row[0];
 
-	zio->io_vsd = rm;
-	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
-
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
 	} else {
@@ -2021,7 +1835,7 @@ raidz_restore_orig_data(raidz_map_t *rm)
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			if (rc->rc_need_orig_restore) {
-				abd_copy_from_buf(rc->rc_abd,
+				abd_copy(rc->rc_abd,
 				    rc->rc_orig_data, rc->rc_size);
 				rc->rc_need_orig_restore = B_FALSE;
 			}
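The abd_copy() change above pairs with the following hunk: rc_orig_data is now itself an ABD (allocated with abd_alloc_linear()) rather than a raw zio_buf_alloc() buffer, so saving a column before a trial reconstruction and restoring it afterwards become symmetric ABD-to-ABD copies. A stand-alone sketch of that save/restore pattern, with plain buffers standing in for ABDs (illustrative names, not the ZFS API):

    #include <stdlib.h>
    #include <string.h>

    /* Illustrative stand-in for one raidz column and its saved copy. */
    struct col {
        size_t  size;
        char   *data;          /* live column buffer (cf. rc_abd) */
        char   *orig;          /* saved copy (cf. rc_orig_data) */
        int     need_restore;  /* cf. rc_need_orig_restore */
    };

    /* Save the column once, before trial reconstruction overwrites it. */
    static void
    col_save(struct col *c)
    {
        if (c->orig == NULL) {
            c->orig = malloc(c->size);
            memcpy(c->orig, c->data, c->size);
        }
        c->need_restore = 1;
    }

    /* On a failed combination, put every touched column back. */
    static void
    col_restore(struct col *c)
    {
        if (c->need_restore) {
            memcpy(c->data, c->orig, c->size);
            c->need_restore = 0;
        }
    }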
@@ -2062,9 +1876,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 				if (rc->rc_devidx == ltgts[lt]) {
 					if (rc->rc_orig_data == NULL) {
 						rc->rc_orig_data =
-						    zio_buf_alloc(rc->rc_size);
-						abd_copy_to_buf(
-						    rc->rc_orig_data,
+						    abd_alloc_linear(
+						    rc->rc_size, B_TRUE);
+						abd_copy(rc->rc_orig_data,
 						    rc->rc_abd, rc->rc_size);
 					}
 					rc->rc_need_orig_restore = B_TRUE;
@@ -2082,10 +1896,8 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 			raidz_restore_orig_data(rm);
 			return (EINVAL);
 		}
-		rr->rr_code = 0;
 		if (dead_data > 0)
-			rr->rr_code = vdev_raidz_reconstruct_row(rm, rr,
-			    my_tgts, t);
+			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
 	}
 
 	/* Check for success */
@@ -2111,7 +1923,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 				if (rc->rc_error == 0 &&
 				    c >= rr->rr_firstdatacol) {
 					raidz_checksum_error(zio,
-					    rc, rc->rc_gdata);
+					    rc, rc->rc_orig_data);
 					rc->rc_error = SET_ERROR(ECKSUM);
 				}
@@ -2318,11 +2130,7 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
 	}
 }
 
-/*
- * return 0 if no reconstruction occurred, otherwise the "code" from
- * vdev_raidz_reconstruct().
- */
-static int
+static void
 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
     raidz_row_t *rr)
 {
@@ -2330,7 +2138,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
-	int code = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
@@ -2385,10 +2192,8 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
 
 		ASSERT(rr->rr_firstdatacol >= n);
 
-		code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
+		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
 	}
-
-	return (code);
 }
 
 /*
@@ -2453,7 +2258,7 @@ vdev_raidz_io_done_unrecoverable(zio_t *zio)
 
 		(void) zfs_ereport_start_checksum(zio->io_spa, cvd,
 		    &zio->io_bookmark, zio, rc->rc_offset,
-		    rc->rc_size, (void *)(uintptr_t)c, &zbc);
+		    rc->rc_size, &zbc);
 		mutex_enter(&cvd->vdev_stat_lock);
 		cvd->vdev_stat.vs_checksum_errors++;
 		mutex_exit(&cvd->vdev_stat_lock);
@@ -2473,8 +2278,7 @@ vdev_raidz_io_done(zio_t *zio)
 	} else {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
-			rr->rr_code =
-			    vdev_raidz_io_done_reconstruct_known_missing(zio,
+			vdev_raidz_io_done_reconstruct_known_missing(zio,
 			    rm, rr);
 		}
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
index 9e9f4a80ba1d..f0f953405cb2 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_fm.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -1125,8 +1125,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
  */
 int
 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
-    struct zio *zio, uint64_t offset, uint64_t length, void *arg,
-    zio_bad_cksum_t *info)
+    struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
 {
 	zio_cksum_report_t *report;
 
@@ -1144,10 +1143,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
 
 	report = kmem_zalloc(sizeof (*report), KM_SLEEP);
 
-	if (zio->io_vsd != NULL)
-		zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
-	else
-		zio_vsd_default_cksum_report(zio, report, arg);
+	zio_vsd_default_cksum_report(zio, report);
 
 	/* copy the checksum failure information if it was provided */
 	if (info != NULL) {
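With the void *arg parameter gone from zfs_ereport_start_checksum(), the per-vdev vsd_cksum_report hook that consumed it no longer exists, and the default report (which snapshots zio->io_abd itself, per the zio.c hunk below) is always used. The call shape before and after, condensed from the hunks in this diff:

    /* before: the raidz column index rode along as an opaque argument */
    (void) zfs_ereport_start_checksum(spa, vd, zb, zio,
        offset, length, (void *)(uintptr_t)c, &zbc);

    /* after: no opaque argument; the default cksum report is always used */
    (void) zfs_ereport_start_checksum(spa, vd, zb, zio,
        offset, length, &zbc);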
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
index 015dde4811e4..a90bf5feeea1 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
@@ -728,7 +728,6 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
 boolean_t
 zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 {
-#ifdef HAVE_KSID
 	uid_t gid;
 
 #ifdef illumos
@@ -773,9 +772,6 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 	 */
 	gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
 	return (groupmember(gid, cr));
-#else
-	return (B_TRUE);
-#endif
 }
 
 void
diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c
index 4bb529f78838..30d5c4821ae5 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_log.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_log.c
@@ -540,6 +540,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	uint32_t blocksize = zp->z_blksz;
 	itx_wr_state_t write_state;
 	uintptr_t fsync_cnt;
+	uint64_t gen = 0;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked ||
 	    zfs_xattr_owner_unlinked(zp)) {
@@ -562,6 +563,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
 	}
 
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
+	    sizeof (gen));
+
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
@@ -609,6 +613,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = ZTOZSB(zp);
+		itx->itx_gen = gen;
 
 		if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) &&
 		    (fsync_cnt == 0))
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
index 0af03e9233b3..8229bc9a93e5 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -740,7 +740,8 @@ static void zfs_get_done(zgd_t *zgd, int error);
  * Get data to generate a TX_WRITE intent log record.
  */
 int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
+    struct lwb *lwb, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
@@ -751,6 +752,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
+	uint64_t zp_gen;
 
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(zio, !=, NULL);
@@ -769,6 +771,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 		zfs_zrele_async(zp);
 		return (SET_ERROR(ENOENT));
 	}
+	/* check if generation number matches */
+	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+	    sizeof (zp_gen)) != 0) {
+		zfs_zrele_async(zp);
+		return (SET_ERROR(EIO));
+	}
+	if (zp_gen != gen) {
+		zfs_zrele_async(zp);
+		return (SET_ERROR(ENOENT));
+	}
 
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_lwb = lwb;
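The generation check above is the substance of #11682: an object number can be recycled if a file is deleted and a new file reuses its id, so the TX_WRITE itx now carries the file's generation (captured in zfs_log_write() earlier in this diff) and zfs_get_data() refuses to read from a znode whose stored generation no longer matches. Condensed, with names as in the diff:

    uint64_t zp_gen;

    /* cannot read the stored generation: fail the lookup outright */
    if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
        sizeof (zp_gen)) != 0) {
        zfs_zrele_async(zp);
        return (SET_ERROR(EIO));
    }
    /* same object number but a different file: treat as nonexistent */
    if (zp_gen != gen) {
        zfs_zrele_async(zp);
        return (SET_ERROR(ENOENT));
    }

The zil.c hunk below passes itx->itx_gen through the zl_get_data callback, which is why every implementation of that callback (zfs_get_data above, zvol_get_data below) grows the extra uint64_t parameter.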
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 7b52f9249298..d9c3042084e3 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -1744,7 +1744,8 @@ cont:
 		 * completed after "lwb_write_zio" completed.
 		 */
 		error = zilog->zl_get_data(itx->itx_private,
-		    lrwb, dbuf, lwb, lwb->lwb_write_zio);
+		    itx->itx_gen, lrwb, dbuf, lwb,
+		    lwb->lwb_write_zio);
 
 		if (error == EIO) {
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 262ca24b1443..a7820e75670b 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -3950,7 +3950,7 @@ zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
 
 /*ARGSUSED*/
 void
-zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
 {
 	void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
 
@@ -4288,7 +4288,7 @@ zio_checksum_verify(zio_t *zio)
 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			(void) zfs_ereport_start_checksum(zio->io_spa,
 			    zio->io_vd, &zio->io_bookmark, zio,
-			    zio->io_offset, zio->io_size, NULL, &info);
+			    zio->io_offset, zio->io_size, &info);
 			mutex_enter(&zio->io_vd->vdev_stat_lock);
 			zio->io_vd->vdev_stat.vs_checksum_errors++;
 			mutex_exit(&zio->io_vd->vdev_stat_lock);
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index 44f9832ce857..b6609363f047 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -673,7 +673,8 @@ zvol_get_done(zgd_t *zgd, int error)
  * Get data to generate a TX_WRITE intent log record.
  */
 int
-zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
+    struct lwb *lwb, zio_t *zio)
 {
 	zvol_state_t *zv = arg;
 	uint64_t offset = lr->lr_offset;