Diffstat (limited to 'sys/contrib/openzfs/module/os/linux')
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c | 4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-cred.c | 12
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-err.c | 5
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-generic.c | 163
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c | 49
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c | 9
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-proc.c | 77
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c | 4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-shrinker.c | 115
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c | 110
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-thread.c | 8
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-trace.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c | 1
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-zone.c | 11
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/abd_os.c | 202
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/arc_os.c | 86
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c | 4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/policy.c | 21
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/qat.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c | 6
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c | 18
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/trace.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c | 1090
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c | 56
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_label_os.c | 45
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c | 118
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c | 97
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c | 25
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c | 115
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c | 4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c | 15
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c | 8
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c | 31
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c | 179
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c | 931
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c | 174
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c | 64
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c | 100
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c | 2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c | 189
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c | 299
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c | 234
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c | 63
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c | 82
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c | 223
48 files changed, 3667 insertions(+), 1394 deletions(-)
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
index d0461a9f1298..5898789ad53d 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
@@ -20,7 +20,7 @@
* You should have received a copy of the GNU General Public License along
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
*
- * Solaris Porting Layer (SPL) Credential Implementation.
+ * Solaris Porting Layer (SPL) Condition Variables Implementation.
*/
#include <sys/condvar.h>
@@ -37,7 +37,7 @@
#endif
#define MAX_HRTIMEOUT_SLACK_US 1000
-unsigned int spl_schedule_hrtimeout_slack_us = 0;
+static unsigned int spl_schedule_hrtimeout_slack_us = 0;
static int
param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
index f81b9540a639..d407fc66b2de 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
@@ -145,6 +145,18 @@ crgetgid(const cred_t *cr)
return (KGID_TO_SGID(cr->fsgid));
}
+/* Return the initial user ns or nop_mnt_idmap */
+zidmap_t *
+zfs_get_init_idmap(void)
+{
+#ifdef HAVE_IOPS_CREATE_IDMAP
+ return ((zidmap_t *)&nop_mnt_idmap);
+#else
+ return ((zidmap_t *)&init_user_ns);
+#endif
+}
+
+EXPORT_SYMBOL(zfs_get_init_idmap);
EXPORT_SYMBOL(crhold);
EXPORT_SYMBOL(crfree);
EXPORT_SYMBOL(crgetuid);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
index c84c39b56bf7..29781b9515b2 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
@@ -32,7 +32,7 @@
* analysis and other such goodies.
 * But we would still default to the current behavior of not doing that.
*/
-unsigned int spl_panic_halt;
+static unsigned int spl_panic_halt;
/* CSTYLED */
module_param(spl_panic_halt, uint, 0644);
MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
@@ -45,7 +45,7 @@ spl_dumpstack(void)
}
EXPORT_SYMBOL(spl_dumpstack);
-int
+void
spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
{
const char *newfile;
@@ -75,7 +75,6 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
schedule();
/* Unreachable */
- return (1);
}
EXPORT_SYMBOL(spl_panic);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
index 5179100d1665..986db1518456 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -23,6 +23,7 @@
* Solaris Porting Layer (SPL) Generic Implementation.
*/
+#include <sys/isa_defs.h>
#include <sys/sysmacros.h>
#include <sys/systeminfo.h>
#include <sys/vmsystm.h>
@@ -47,6 +48,8 @@
#include <linux/mod_compat.h>
#include <sys/cred.h>
#include <sys/vnode.h>
+#include <sys/misc.h>
+#include <linux/mod_compat.h>
unsigned long spl_hostid = 0;
EXPORT_SYMBOL(spl_hostid);
@@ -59,10 +62,10 @@ proc_t p0;
EXPORT_SYMBOL(p0);
/*
- * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna
+ * xoshiro256++ 1.0 PRNG by David Blackman and Sebastiano Vigna
*
- * "Further scramblings of Marsaglia's xorshift generators"
- * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ * "Scrambled Linear Pseudorandom Number Generators∗"
+ * https://vigna.di.unimi.it/ftp/papers/ScrambledLinear.pdf
*
* random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
* is to provide bytes containing random numbers. It is mapped to /dev/urandom
@@ -74,66 +77,85 @@ EXPORT_SYMBOL(p0);
* free of atomic instructions.
*
* A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
- * to generate words larger than 128 bits will paradoxically be limited to
- * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
- * 128-bit words and selecting the first will implicitly select the second. If
+ * to generate words larger than 256 bits will paradoxically be limited to
+ * `2^256 - 1` possibilities. This is because we have a sequence of `2^256 - 1`
+ * 256-bit words and selecting the first will implicitly select the second. If
* a caller finds this behavior undesirable, random_get_bytes() should be used
* instead.
*
* XXX: Linux interrupt handlers that trigger within the critical section
- * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * formed by `s[3] = xp[3];` and `xp[0] = s[0];` and call this function will
* see the same numbers. Nothing in the code currently calls this in an
* interrupt handler, so this is considered to be okay. If that becomes a
* problem, we could create a set of per-cpu variables for interrupt handlers
* and use them when in_interrupt() from linux/preempt_mask.h evaluates to
* true.
*/
-void __percpu *spl_pseudo_entropy;
+static void __percpu *spl_pseudo_entropy;
/*
- * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
- * file:
+ * rotl()/spl_rand_next()/spl_rand_jump() are copied from the following CC-0
+ * licensed file:
*
- * http://xorshift.di.unimi.it/xorshift128plus.c
+ * https://prng.di.unimi.it/xoshiro256plusplus.c
*/
+static inline uint64_t rotl(const uint64_t x, int k)
+{
+ return ((x << k) | (x >> (64 - k)));
+}
+
static inline uint64_t
spl_rand_next(uint64_t *s)
{
- uint64_t s1 = s[0];
- const uint64_t s0 = s[1];
- s[0] = s0;
- s1 ^= s1 << 23; // a
- s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
- return (s[1] + s0);
+ const uint64_t result = rotl(s[0] + s[3], 23) + s[0];
+
+ const uint64_t t = s[1] << 17;
+
+ s[2] ^= s[0];
+ s[3] ^= s[1];
+ s[1] ^= s[2];
+ s[0] ^= s[3];
+
+ s[2] ^= t;
+
+ s[3] = rotl(s[3], 45);
+
+ return (result);
}
static inline void
spl_rand_jump(uint64_t *s)
{
- static const uint64_t JUMP[] =
- { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+ static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba,
+ 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };
uint64_t s0 = 0;
uint64_t s1 = 0;
+ uint64_t s2 = 0;
+ uint64_t s3 = 0;
int i, b;
for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
for (b = 0; b < 64; b++) {
if (JUMP[i] & 1ULL << b) {
s0 ^= s[0];
s1 ^= s[1];
+ s2 ^= s[2];
+ s3 ^= s[3];
}
(void) spl_rand_next(s);
}
s[0] = s0;
s[1] = s1;
+ s[2] = s2;
+ s[3] = s3;
}
int
random_get_pseudo_bytes(uint8_t *ptr, size_t len)
{
- uint64_t *xp, s[2];
+ uint64_t *xp, s[4];
ASSERT(ptr);
@@ -141,6 +163,8 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len)
s[0] = xp[0];
s[1] = xp[1];
+ s[2] = xp[2];
+ s[3] = xp[3];
while (len) {
union {
@@ -152,12 +176,22 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len)
len -= i;
entropy.ui64 = spl_rand_next(s);
+ /*
+ * xoshiro256++ has low entropy lower bytes, so we copy the
+ * higher order bytes first.
+ */
while (i--)
+#ifdef _ZFS_BIG_ENDIAN
*ptr++ = entropy.byte[i];
+#else
+ *ptr++ = entropy.byte[7 - i];
+#endif
}
xp[0] = s[0];
xp[1] = s[1];
+ xp[2] = s[2];
+ xp[3] = s[3];
put_cpu_ptr(spl_pseudo_entropy);
@@ -220,8 +254,10 @@ __div_u64(uint64_t u, uint32_t v)
* replacements for libgcc-provided functions and will never be called
* directly.
*/
+#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#endif
/*
* Implementation of 64-bit unsigned division for 32-bit machines.
@@ -415,7 +451,9 @@ __aeabi_ldivmod(int64_t u, int64_t v)
EXPORT_SYMBOL(__aeabi_ldivmod);
#endif /* __arm || __arm__ */
+#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
+#endif
#endif /* BITS_PER_LONG */
@@ -458,7 +496,7 @@ int ddi_strto##type(const char *str, char **endptr, \
if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
base = 16; /* hex */ \
ptr += 2; \
- } else if (str[1] >= '0' && str[1] < 8) { \
+ } else if (str[1] >= '0' && str[1] < '8') { \
base = 8; /* octal */ \
ptr += 1; \
} else { \
@@ -517,6 +555,61 @@ ddi_copyin(const void *from, void *to, size_t len, int flags)
}
EXPORT_SYMBOL(ddi_copyin);
+#define define_spl_param(type, fmt) \
+int \
+spl_param_get_##type(char *buf, zfs_kernel_param_t *kp) \
+{ \
+ return (scnprintf(buf, PAGE_SIZE, fmt "\n", \
+ *(type *)kp->arg)); \
+} \
+int \
+spl_param_set_##type(const char *buf, zfs_kernel_param_t *kp) \
+{ \
+ return (kstrto##type(buf, 0, (type *)kp->arg)); \
+} \
+const struct kernel_param_ops spl_param_ops_##type = { \
+ .set = spl_param_set_##type, \
+ .get = spl_param_get_##type, \
+}; \
+EXPORT_SYMBOL(spl_param_get_##type); \
+EXPORT_SYMBOL(spl_param_set_##type); \
+EXPORT_SYMBOL(spl_param_ops_##type);
+
+define_spl_param(s64, "%lld")
+define_spl_param(u64, "%llu")
+
+/*
+ * Post a uevent to userspace whenever a new vdev is added to the pool. It
+ * is necessary to sync blkid information with udev, which the zed daemon
+ * uses during device hotplug to identify the vdev.
+ */
+void
+spl_signal_kobj_evt(struct block_device *bdev)
+{
+#if defined(HAVE_BDEV_KOBJ) || defined(HAVE_PART_TO_DEV)
+#ifdef HAVE_BDEV_KOBJ
+ struct kobject *disk_kobj = bdev_kobj(bdev);
+#else
+ struct kobject *disk_kobj = &part_to_dev(bdev->bd_part)->kobj;
+#endif
+ if (disk_kobj) {
+ int ret = kobject_uevent(disk_kobj, KOBJ_CHANGE);
+ if (ret) {
+ pr_warn("ZFS: Sending event '%d' to kobject: '%s'"
+ " (%p): failed(ret:%d)\n", KOBJ_CHANGE,
+ kobject_name(disk_kobj), disk_kobj, ret);
+ }
+ }
+#else
+/*
+ * This is encountered if neither bdev_kobj() nor part_to_dev() is available
+ * in the kernel - likely due to an API change that needs to be chased down.
+ */
+#error "Unsupported kernel: unable to get struct kobj from bdev"
+#endif
+}
+EXPORT_SYMBOL(spl_signal_kobj_evt);
+
int
ddi_copyout(const void *from, void *to, size_t len, int flags)
{
@@ -705,28 +798,33 @@ spl_kvmem_init(void)
* initialize each of the per-cpu seeds so that the sequences generated on each
* CPU are guaranteed to never overlap in practice.
*/
-static void __init
+static int __init
spl_random_init(void)
{
- uint64_t s[2];
+ uint64_t s[4];
int i = 0;
- spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t),
+ spl_pseudo_entropy = __alloc_percpu(4 * sizeof (uint64_t),
sizeof (uint64_t));
+ if (!spl_pseudo_entropy)
+ return (-ENOMEM);
+
get_random_bytes(s, sizeof (s));
- if (s[0] == 0 && s[1] == 0) {
+ if (s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0) {
if (jiffies != 0) {
s[0] = jiffies;
s[1] = ~0 - jiffies;
+ s[2] = ~jiffies;
+ s[3] = jiffies - ~0;
} else {
- (void) memcpy(s, "improbable seed", sizeof (s));
+ (void) memcpy(s, "improbable seed", 16);
}
printk("SPL: get_random_bytes() returned 0 "
"when generating random seed. Setting initial seed to "
- "0x%016llx%016llx.\n", cpu_to_be64(s[0]),
- cpu_to_be64(s[1]));
+ "0x%016llx%016llx%016llx%016llx.\n", cpu_to_be64(s[0]),
+ cpu_to_be64(s[1]), cpu_to_be64(s[2]), cpu_to_be64(s[3]));
}
for_each_possible_cpu(i) {
@@ -736,7 +834,11 @@ spl_random_init(void)
wordp[0] = s[0];
wordp[1] = s[1];
+ wordp[2] = s[2];
+ wordp[3] = s[3];
}
+
+ return (0);
}
static void
@@ -757,7 +859,8 @@ spl_init(void)
{
int rc = 0;
- spl_random_init();
+ if ((rc = spl_random_init()))
+ goto out0;
if ((rc = spl_kvmem_init()))
goto out1;
@@ -800,6 +903,8 @@ out3:
out2:
spl_kvmem_fini();
out1:
+ spl_random_fini();
+out0:
return (rc);
}
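The define_spl_param() macro above generates get/set handlers plus a kernel_param_ops table for 64-bit tunables (spl_param_ops_s64 / spl_param_ops_u64), which later hunks in arc_os.c and mmp_os.c switch to in place of the old param_set_long/param_set_ulong helpers. A minimal sketch of wiring a tunable to the generated ops; the variable my_tunable is hypothetical, and module_param_cb() is the stock kernel macro for parameters with custom ops:

static u64 my_tunable = 1024;   /* hypothetical u64 tunable */

/* register with the ops generated by define_spl_param(u64, "%llu") */
module_param_cb(my_tunable, &spl_param_ops_u64, &my_tunable, 0644);
MODULE_PARM_DESC(my_tunable, "Hypothetical tunable using spl_param_ops_u64");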
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
index ba4ca49a2ac9..42821ad60256 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -28,6 +28,7 @@
#include <sys/timer.h>
#include <sys/vmem.h>
#include <sys/wait.h>
+#include <sys/string.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/prefetch.h>
@@ -76,17 +77,6 @@ module_param(spl_kmem_cache_magazine_size, uint, 0444);
MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
"Default magazine size (2-256), set automatically (0)");
-/*
- * The default behavior is to report the number of objects remaining in the
- * cache. This allows the Linux VM to repeatedly reclaim objects from the
- * cache when memory is low satisfy other memory allocations. Alternately,
- * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
- * is reclaimed. This may increase the likelihood of out of memory events.
- */
-static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
-module_param(spl_kmem_cache_reclaim, uint, 0644);
-MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
-
static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
@@ -102,7 +92,8 @@ MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
* of 16K was determined to be optimal for architectures using 4K pages and
 * to also work well on architectures using larger 64K page sizes.
*/
-static unsigned int spl_kmem_cache_slab_limit = 16384;
+static unsigned int spl_kmem_cache_slab_limit =
+ SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE;
module_param(spl_kmem_cache_slab_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
"Objects less than N bytes use the Linux slab");
@@ -151,7 +142,7 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
struct list_head spl_kmem_cache_list; /* List of caches */
struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
-taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */
+static taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */
static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
@@ -182,8 +173,11 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
* of that infrastructure we are responsible for incrementing it.
*/
if (current->reclaim_state)
+#ifdef HAVE_RECLAIM_STATE_RECLAIMED
+ current->reclaim_state->reclaimed += size >> PAGE_SHIFT;
+#else
current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
-
+#endif
vfree(ptr);
}
@@ -701,12 +695,12 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align,
skc->skc_magic = SKC_MAGIC;
skc->skc_name_size = strlen(name) + 1;
- skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+ skc->skc_name = kmalloc(skc->skc_name_size, lflags);
if (skc->skc_name == NULL) {
kfree(skc);
return (NULL);
}
- strncpy(skc->skc_name, name, skc->skc_name_size);
+ strlcpy(skc->skc_name, name, skc->skc_name_size);
skc->skc_ctor = ctor;
skc->skc_dtor = dtor;
@@ -791,10 +785,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align,
} else {
unsigned long slabflags = 0;
- if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
- rc = EINVAL;
+ if (size > spl_kmem_cache_slab_limit)
goto out;
- }
#if defined(SLAB_USERCOPY)
/*
@@ -815,10 +807,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align,
skc->skc_linux_cache = kmem_cache_create(
skc->skc_name, size, align, slabflags, NULL);
#endif
- if (skc->skc_linux_cache == NULL) {
- rc = ENOMEM;
+ if (skc->skc_linux_cache == NULL)
goto out;
- }
}
down_write(&spl_kmem_cache_sem);
@@ -1016,10 +1006,20 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT((skc->skc_flags & KMC_SLAB) == 0);
- might_sleep();
+
*obj = NULL;
/*
+ * Since we can't sleep, attempt an emergency allocation to satisfy
+ * the request. The only alternative is to fail the allocation, but
+ * it's preferable to try. The use of KM_NOSLEEP is expected to be rare.
+ */
+ if (flags & KM_NOSLEEP)
+ return (spl_emergency_alloc(skc, flags, obj));
+
+ might_sleep();
+
+ /*
* Before allocating a new slab wait for any reaping to complete and
* then return so the local magazine can be rechecked for new objects.
*/
@@ -1452,6 +1452,9 @@ spl_kmem_cache_init(void)
spl_kmem_cache_kmem_threads * 8, INT_MAX,
TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ if (spl_kmem_cache_taskq == NULL)
+ return (-ENOMEM);
+
return (0);
}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
index c6d3c8f4413f..ad553a73a69e 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
@@ -32,6 +32,7 @@
#include <sys/vmem.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
+#include <sys/string.h>
static kmutex_t kstat_module_lock;
static struct list_head kstat_module_list;
@@ -390,7 +391,7 @@ kstat_create_module(char *name)
module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
module->ksm_proc = pde;
- strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
+ strlcpy(module->ksm_name, name, KSTAT_STRLEN);
INIT_LIST_HEAD(&module->ksm_kstat_list);
list_add_tail(&module->ksm_module_list, &kstat_module_list);
@@ -479,8 +480,8 @@ kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
kpep->kpe_owner = NULL;
kpep->kpe_proc = NULL;
INIT_LIST_HEAD(&kpep->kpe_list);
- strncpy(kpep->kpe_module, module, KSTAT_STRLEN);
- strncpy(kpep->kpe_name, name, KSTAT_STRLEN);
+ strlcpy(kpep->kpe_module, module, sizeof (kpep->kpe_module));
+ strlcpy(kpep->kpe_name, name, sizeof (kpep->kpe_name));
}
EXPORT_SYMBOL(kstat_proc_entry_init);
@@ -514,7 +515,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
ksp->ks_crtime = gethrtime();
ksp->ks_snaptime = ksp->ks_crtime;
ksp->ks_instance = ks_instance;
- strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
+ strlcpy(ksp->ks_class, ks_class, sizeof (ksp->ks_class));
ksp->ks_type = ks_type;
ksp->ks_flags = ks_flags;
ksp->ks_update = kstat_default_update;
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
index 01f5619e1893..f0f929d3ce90 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -47,6 +47,10 @@ static unsigned long table_min = 0;
static unsigned long table_max = ~0;
static struct ctl_table_header *spl_header = NULL;
+#ifndef HAVE_REGISTER_SYSCTL_TABLE
+static struct ctl_table_header *spl_kmem = NULL;
+static struct ctl_table_header *spl_kstat = NULL;
+#endif
static struct proc_dir_entry *proc_spl = NULL;
static struct proc_dir_entry *proc_spl_kmem = NULL;
static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
@@ -624,6 +628,7 @@ static struct ctl_table spl_table[] = {
.mode = 0644,
.proc_handler = &proc_dohostid,
},
+#ifdef HAVE_REGISTER_SYSCTL_TABLE
{
.procname = "kmem",
.mode = 0555,
@@ -634,9 +639,11 @@ static struct ctl_table spl_table[] = {
.mode = 0555,
.child = spl_kstat_table,
},
+#endif
{},
};
+#ifdef HAVE_REGISTER_SYSCTL_TABLE
static struct ctl_table spl_dir[] = {
{
.procname = "spl",
@@ -648,21 +655,64 @@ static struct ctl_table spl_dir[] = {
static struct ctl_table spl_root[] = {
{
- .procname = "kernel",
- .mode = 0555,
- .child = spl_dir,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = spl_dir,
},
{}
};
+#endif
+
+static void spl_proc_cleanup(void)
+{
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+
+#ifndef HAVE_REGISTER_SYSCTL_TABLE
+ if (spl_kstat) {
+ unregister_sysctl_table(spl_kstat);
+ spl_kstat = NULL;
+ }
+ if (spl_kmem) {
+ unregister_sysctl_table(spl_kmem);
+ spl_kmem = NULL;
+ }
+#endif
+ if (spl_header) {
+ unregister_sysctl_table(spl_header);
+ spl_header = NULL;
+ }
+}
int
spl_proc_init(void)
{
int rc = 0;
+#ifdef HAVE_REGISTER_SYSCTL_TABLE
spl_header = register_sysctl_table(spl_root);
if (spl_header == NULL)
return (-EUNATCH);
+#else
+ spl_header = register_sysctl("kernel/spl", spl_table);
+ if (spl_header == NULL)
+ return (-EUNATCH);
+
+ spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table);
+ if (spl_kmem == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+ spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table);
+ if (spl_kstat == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+#endif
proc_spl = proc_mkdir("spl", NULL);
if (proc_spl == NULL) {
@@ -703,15 +753,8 @@ spl_proc_init(void)
goto out;
}
out:
- if (rc) {
- remove_proc_entry("kstat", proc_spl);
- remove_proc_entry("slab", proc_spl_kmem);
- remove_proc_entry("kmem", proc_spl);
- remove_proc_entry("taskq-all", proc_spl);
- remove_proc_entry("taskq", proc_spl);
- remove_proc_entry("spl", NULL);
- unregister_sysctl_table(spl_header);
- }
+ if (rc)
+ spl_proc_cleanup();
return (rc);
}
@@ -719,13 +762,5 @@ out:
void
spl_proc_fini(void)
{
- remove_proc_entry("kstat", proc_spl);
- remove_proc_entry("slab", proc_spl_kmem);
- remove_proc_entry("kmem", proc_spl);
- remove_proc_entry("taskq-all", proc_spl);
- remove_proc_entry("taskq", proc_spl);
- remove_proc_entry("spl", NULL);
-
- ASSERT(spl_header != NULL);
- unregister_sysctl_table(spl_header);
+ spl_proc_cleanup();
}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
index 81501460f04f..5e073950d61a 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -23,9 +23,9 @@
*/
#include <sys/list.h>
-#include <sys/mutex.h>
#include <sys/procfs_list.h>
#include <linux/proc_fs.h>
+#include <sys/mutex.h>
/*
* A procfs_list is a wrapper around a linked list which implements the seq_file
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-shrinker.c b/sys/contrib/openzfs/module/os/linux/spl/spl-shrinker.c
new file mode 100644
index 000000000000..d5c8da471cbb
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-shrinker.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Shrinker Implementation.
+ */
+
+#include <sys/kmem.h>
+#include <sys/shrinker.h>
+
+#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
+/* 3.0-3.11: single shrink() callback, which we wrap to carry both functions */
+struct spl_shrinker_wrap {
+ struct shrinker shrinker;
+ spl_shrinker_cb countfunc;
+ spl_shrinker_cb scanfunc;
+};
+
+static int
+spl_shrinker_single_cb(struct shrinker *shrinker, struct shrink_control *sc)
+{
+ struct spl_shrinker_wrap *sw = (struct spl_shrinker_wrap *)shrinker;
+
+ if (sc->nr_to_scan != 0)
+ (void) sw->scanfunc(&sw->shrinker, sc);
+ return (sw->countfunc(&sw->shrinker, sc));
+}
+#endif
+
+struct shrinker *
+spl_register_shrinker(const char *name, spl_shrinker_cb countfunc,
+ spl_shrinker_cb scanfunc, int seek_cost)
+{
+ struct shrinker *shrinker;
+
+ /* allocate shrinker */
+#if defined(HAVE_SHRINKER_REGISTER)
+ /* 6.7: kernel will allocate the shrinker for us */
+ shrinker = shrinker_alloc(0, name);
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+ /* 3.12-6.6: we allocate the shrinker */
+ shrinker = kmem_zalloc(sizeof (struct shrinker), KM_SLEEP);
+#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
+ /* 3.0-3.11: allocate a wrapper */
+ struct spl_shrinker_wrap *sw =
+ kmem_zalloc(sizeof (struct spl_shrinker_wrap), KM_SLEEP);
+ shrinker = &sw->shrinker;
+#else
+ /* 2.x-2.6.22, or a newer shrinker API has been introduced. */
+#error "Unknown shrinker API"
+#endif
+
+ if (shrinker == NULL)
+ return (NULL);
+
+ /* set callbacks */
+#ifdef HAVE_SINGLE_SHRINKER_CALLBACK
+ sw->countfunc = countfunc;
+ sw->scanfunc = scanfunc;
+ shrinker->shrink = spl_shrinker_single_cb;
+#else
+ shrinker->count_objects = countfunc;
+ shrinker->scan_objects = scanfunc;
+#endif
+
+ /* set params */
+ shrinker->seeks = seek_cost;
+
+ /* register with kernel */
+#if defined(HAVE_SHRINKER_REGISTER)
+ shrinker_register(shrinker);
+#elif defined(HAVE_REGISTER_SHRINKER_VARARG)
+ register_shrinker(shrinker, name);
+#else
+ register_shrinker(shrinker);
+#endif
+
+ return (shrinker);
+}
+EXPORT_SYMBOL(spl_register_shrinker);
+
+void
+spl_unregister_shrinker(struct shrinker *shrinker)
+{
+#if defined(HAVE_SHRINKER_REGISTER)
+ shrinker_free(shrinker);
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+ unregister_shrinker(shrinker);
+ kmem_free(shrinker, sizeof (struct shrinker));
+#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
+ unregister_shrinker(shrinker);
+ kmem_free(shrinker, sizeof (struct spl_shrinker_wrap));
+#else
+#error "Unknown shrinker API"
+#endif
+}
+EXPORT_SYMBOL(spl_unregister_shrinker);
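The new spl-shrinker.c hides three generations of the Linux shrinker API (the pre-3.12 single shrink() callback, the 3.12-6.6 split count/scan callbacks, and 6.7's shrinker_alloc()/shrinker_register()) behind one register/unregister pair. A minimal consumer sketch with hypothetical callbacks, assuming spl_shrinker_cb matches the kernel's usual count_objects/scan_objects signature; arc_os.c below registers "zfs-arc-shrinker" the same way:

static unsigned long
my_count(struct shrinker *shrink, struct shrink_control *sc)
{
        return (0);             /* hypothetical: freeable object count */
}

static unsigned long
my_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        return (SHRINK_STOP);   /* hypothetical: nothing reclaimed */
}

static struct shrinker *my_shrinker;

/* at init: may return NULL, e.g. if shrinker_alloc() fails */
my_shrinker = spl_register_shrinker("my-shrinker", my_count, my_scan,
    DEFAULT_SEEKS);

/* at teardown */
spl_unregister_shrinker(my_shrinker);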
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
index 0aab148975aa..c384b7b378c3 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
@@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
+static uint_t spl_taskq_thread_timeout_ms = 5000;
+/* BEGIN CSTYLED */
+module_param(spl_taskq_thread_timeout_ms, uint, 0644);
+/* END CSTYLED */
+MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
+ "Minimum idle threads exit interval for dynamic taskqs");
static int spl_taskq_thread_dynamic = 1;
module_param(spl_taskq_thread_dynamic, int, 0444);
@@ -46,8 +52,10 @@ module_param(spl_taskq_thread_priority, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_priority,
"Allow non-default priority for taskq threads");
-static int spl_taskq_thread_sequential = 4;
-module_param(spl_taskq_thread_sequential, int, 0644);
+static uint_t spl_taskq_thread_sequential = 4;
+/* BEGIN CSTYLED */
+module_param(spl_taskq_thread_sequential, uint, 0644);
+/* END CSTYLED */
MODULE_PARM_DESC(spl_taskq_thread_sequential,
"Create new taskq threads after N sequential tasks");
@@ -586,8 +594,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
ASSERT(tq->tq_nactive <= tq->tq_nthreads);
if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
/* Dynamic taskq may be able to spawn another thread */
- if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
- taskq_thread_spawn(tq) == 0)
+ if (taskq_thread_spawn(tq) == 0)
goto out;
}
@@ -621,11 +628,11 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
spin_unlock(&t->tqent_lock);
wake_up(&tq->tq_work_waitq);
-out:
+
/* Spawn additional taskq threads if required. */
if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
-
+out:
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
return (rc);
}
@@ -668,10 +675,11 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
spin_unlock(&t->tqent_lock);
-out:
+
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
+out:
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
return (rc);
}
@@ -696,9 +704,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
/* Dynamic taskq may be able to spawn another thread */
- if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
- taskq_thread_spawn(tq) == 0)
- goto out2;
+ if (taskq_thread_spawn(tq) == 0)
+ goto out;
flags |= TQ_FRONT;
}
@@ -734,11 +741,11 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
spin_unlock(&t->tqent_lock);
wake_up(&tq->tq_work_waitq);
-out:
+
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
-out2:
+out:
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
}
EXPORT_SYMBOL(taskq_dispatch_ent);
@@ -817,6 +824,7 @@ taskq_thread_spawn(taskq_t *tq)
if (!(tq->tq_flags & TASKQ_DYNAMIC))
return (0);
+ tq->lastspawnstop = jiffies;
if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
(tq->tq_flags & TASKQ_ACTIVE)) {
spawning = (++tq->tq_nspawn);
@@ -828,9 +836,9 @@ taskq_thread_spawn(taskq_t *tq)
}
/*
- * Threads in a dynamic taskq should only exit once it has been completely
- * drained and no other threads are actively servicing tasks. This prevents
- * threads from being created and destroyed more than is required.
+ * Threads in a dynamic taskq may exit once there is no more work to do.
+ * To prevent threads from being created and destroyed too often, limit
+ * the exit rate to one per spl_taskq_thread_timeout_ms.
*
 * The first thread in the thread list is treated as the primary thread.
* There is nothing special about the primary thread but in order to avoid
@@ -839,19 +847,22 @@ taskq_thread_spawn(taskq_t *tq)
static int
taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
{
- if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ ASSERT(!taskq_next_ent(tq));
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic)
return (0);
-
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ return (1);
if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
tqt_thread_list) == tqt)
return (0);
-
- return
- ((tq->tq_nspawn == 0) && /* No threads are being spawned */
- (tq->tq_nactive == 0) && /* No threads are handling tasks */
- (tq->tq_nthreads > 1) && /* More than 1 thread is running */
- (!taskq_next_ent(tq)) && /* There are no pending tasks */
- (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+ ASSERT3U(tq->tq_nthreads, >, 1);
+ if (tq->tq_nspawn != 0)
+ return (0);
+ if (time_before(jiffies, tq->lastspawnstop +
+ msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
+ return (0);
+ tq->lastspawnstop = jiffies;
+ return (1);
}
static int
@@ -902,10 +913,8 @@ taskq_thread(void *args)
if (list_empty(&tq->tq_pend_list) &&
list_empty(&tq->tq_prio_list)) {
- if (taskq_thread_should_stop(tq, tqt)) {
- wake_up_all(&tq->tq_wait_waitq);
+ if (taskq_thread_should_stop(tq, tqt))
break;
- }
add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
spin_unlock_irqrestore(&tq->tq_lock, flags);
@@ -980,9 +989,6 @@ taskq_thread(void *args)
tqt->tqt_id = TASKQID_INVALID;
tqt->tqt_flags = 0;
wake_up_all(&tq->tq_wait_waitq);
- } else {
- if (taskq_thread_should_stop(tq, tqt))
- break;
}
set_current_state(TASK_INTERRUPTIBLE);
@@ -1046,7 +1052,6 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
ASSERT(name != NULL);
ASSERT(minalloc >= 0);
- ASSERT(maxalloc <= INT_MAX);
ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
/* Scale the number of threads using nthreads as a percentage */
@@ -1090,6 +1095,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
tq->tq_flags = (flags | TASKQ_ACTIVE);
tq->tq_next_id = TASKQID_INITIAL;
tq->tq_lowest_id = TASKQID_INITIAL;
+ tq->lastspawnstop = jiffies;
INIT_LIST_HEAD(&tq->tq_free_list);
INIT_LIST_HEAD(&tq->tq_pend_list);
INIT_LIST_HEAD(&tq->tq_prio_list);
@@ -1229,6 +1235,42 @@ taskq_destroy(taskq_t *tq)
}
EXPORT_SYMBOL(taskq_destroy);
+/*
+ * Create a taskq with a specified number of pool threads. Allocate
+ * and return an array of nthreads kthread_t pointers, one for each
+ * thread in the pool. The array is not ordered and must be freed
+ * by the caller.
+ */
+taskq_t *
+taskq_create_synced(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
+{
+ taskq_t *tq;
+ taskq_thread_t *tqt;
+ int i = 0;
+ kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
+ KM_SLEEP);
+
+ flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
+
+ /* taskq_create spawns all the threads before returning */
+ tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
+ flags | TASKQ_PREPOPULATE);
+ VERIFY(tq != NULL);
+ VERIFY(tq->tq_nthreads == nthreads);
+
+ list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) {
+ kthreads[i] = tqt->tqt_thread;
+ i++;
+ }
+
+ ASSERT3S(i, ==, nthreads);
+ *ktpp = kthreads;
+
+ return (tq);
+}
+EXPORT_SYMBOL(taskq_create_synced);
+
static unsigned int spl_taskq_kick = 0;
/*
@@ -1379,7 +1421,7 @@ spl_taskq_init(void)
system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
if (system_taskq == NULL)
- return (1);
+ return (-ENOMEM);
system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
@@ -1388,7 +1430,7 @@ spl_taskq_init(void)
cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
#endif
taskq_destroy(system_taskq);
- return (1);
+ return (-ENOMEM);
}
dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
@@ -1399,7 +1441,7 @@ spl_taskq_init(void)
#endif
taskq_destroy(system_taskq);
taskq_destroy(system_delay_taskq);
- return (1);
+ return (-ENOMEM);
}
/*
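taskq_create_synced() above returns both the taskq and an unordered array holding one kthread_t pointer per pool thread; the array is the caller's to free. A usage sketch (names hypothetical):

kthread_t **kthreads;
int nthreads = 4;

taskq_t *tq = taskq_create_synced("my_pool", nthreads, minclsyspri,
    nthreads, INT_MAX, 0, &kthreads);

/* hand kthreads[0..nthreads-1] to whatever needs direct thread handles */

kmem_free(kthreads, sizeof (kthread_t *) * nthreads);
taskq_destroy(tq);

Note that the pri/minalloc/maxalloc arguments are effectively fixed by the implementation (it passes minclsyspri, nthreads, INT_MAX through to taskq_create()), and TASKQ_DYNAMIC is stripped so the thread set stays stable for the lifetime of the taskq.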
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
index 32a2d34b1d93..ee3eb4690c3a 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -26,6 +26,7 @@
#include <sys/thread.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
+#include <sys/string.h>
/*
* Thread interfaces
@@ -92,7 +93,7 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func,
return (NULL);
}
- strncpy(tp->tp_name, name, tp->tp_name_size);
+ strlcpy(tp->tp_name, name, tp->tp_name_size);
/*
* Strip trailing "_thread" from passed name which will be the func
@@ -178,12 +179,11 @@ issig(int why)
sigorsets(&set, &task->blocked, &set);
spin_lock_irq(&task->sighand->siglock);
- int ret;
#ifdef HAVE_DEQUEUE_SIGNAL_4ARG
enum pid_type __type;
- if ((ret = dequeue_signal(task, &set, &__info, &__type)) != 0) {
+ if (dequeue_signal(task, &set, &__info, &__type) != 0) {
#else
- if ((ret = dequeue_signal(task, &set, &__info)) != 0) {
+ if (dequeue_signal(task, &set, &__info) != 0) {
#endif
#ifdef HAVE_SIGNAL_STOP
spin_unlock_irq(&task->sighand->siglock);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
index 7912a381294d..d3e53e541b8b 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
index 546db9ab8bd7..389c9d0d6df3 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
@@ -706,7 +706,7 @@ spl_tsd_init(void)
{
tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
if (tsd_hash_table == NULL)
- return (1);
+ return (-ENOMEM);
return (0);
}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
index 6b77524181db..e1773da5d173 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
@@ -25,6 +25,7 @@
#include <sys/debug.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
+#include <rpc/types.h>
#include <rpc/xdr.h>
/*
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
index 589496da0c78..8c6282ee5d16 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
@@ -204,7 +204,7 @@ spl_zlib_init(void)
size, 0, NULL, NULL, NULL, NULL, NULL,
KMC_KVMEM);
if (!zlib_workspace_cache)
- return (1);
+ return (-ENOMEM);
return (0);
}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
index b8a8b7cd8cd8..d0d0cca154a7 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
@@ -25,18 +25,20 @@
*/
#include <sys/types.h>
-#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <linux/file.h>
#include <linux/magic.h>
#include <sys/zone.h>
+#include <sys/string.h>
#if defined(CONFIG_USER_NS)
#include <linux/statfs.h>
#include <linux/proc_ns.h>
#endif
+#include <sys/mutex.h>
+
static kmutex_t zone_datasets_lock;
static struct list_head zone_datasets;
@@ -49,7 +51,7 @@ typedef struct zone_datasets {
typedef struct zone_dataset {
struct list_head zd_list; /* zone_dataset linkage */
size_t zd_dsnamelen; /* length of name */
- char zd_dsname[0]; /* name of the member dataset */
+ char zd_dsname[]; /* name of the member dataset */
} zone_dataset_t;
#if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
@@ -203,8 +205,7 @@ zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
zd->zd_dsnamelen = dsnamelen;
- strncpy(zd->zd_dsname, dataset, dsnamelen);
- zd->zd_dsname[dsnamelen] = '\0';
+ strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
INIT_LIST_HEAD(&zd->zd_list);
list_add_tail(&zd->zd_list, &zds->zds_datasets);
@@ -415,8 +416,8 @@ spl_zone_fini(void)
zone_dataset_t, zd_list);
list_del(&zd->zd_list);
kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
- put_user_ns(zds->zds_userns);
}
+ put_user_ns(zds->zds_userns);
list_del(&zds->zds_list);
kmem_free(zds, sizeof (*zds));
}
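The zd_dsname change above swaps the GNU zero-length array idiom (char zd_dsname[0]) for a C99 flexible array member. The allocation pattern is unchanged: over-allocate the struct by the string length, then copy with strlcpy(), which guarantees NUL termination and makes the manual zd_dsname[dsnamelen] = '\0' unnecessary. A generic illustration of the idiom (struct and names hypothetical, not from this file):

typedef struct named {
        size_t len;     /* length of name, excluding NUL */
        char name[];    /* flexible array member, must be last */
} named_t;

named_t *n = kmem_alloc(sizeof (named_t) + len + 1, KM_SLEEP);
n->len = len;
strlcpy(n->name, src, len + 1);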
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index 0cd4fa5213d4..cee7410c8833 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
*/
/*
@@ -59,9 +60,19 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
+#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
+#include <linux/version.h>
+#endif
+
+#ifdef _KERNEL
+#if defined(MAX_ORDER)
+#define ABD_MAX_ORDER (MAX_ORDER)
+#elif defined(MAX_PAGE_ORDER)
+#define ABD_MAX_ORDER (MAX_PAGE_ORDER)
+#endif
#else
-#define MAX_ORDER 1
+#define ABD_MAX_ORDER (1)
#endif
typedef struct abd_stats {
@@ -71,7 +82,7 @@ typedef struct abd_stats {
kstat_named_t abdstat_scatter_cnt;
kstat_named_t abdstat_scatter_data_size;
kstat_named_t abdstat_scatter_chunk_waste;
- kstat_named_t abdstat_scatter_orders[MAX_ORDER];
+ kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
kstat_named_t abdstat_scatter_page_multi_chunk;
kstat_named_t abdstat_scatter_page_multi_zone;
kstat_named_t abdstat_scatter_page_alloc_retry;
@@ -132,14 +143,14 @@ static abd_stats_t abd_stats = {
{ "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
};
-struct {
+static struct {
wmsum_t abdstat_struct_size;
wmsum_t abdstat_linear_cnt;
wmsum_t abdstat_linear_data_size;
wmsum_t abdstat_scatter_cnt;
wmsum_t abdstat_scatter_data_size;
wmsum_t abdstat_scatter_chunk_waste;
- wmsum_t abdstat_scatter_orders[MAX_ORDER];
+ wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
wmsum_t abdstat_scatter_page_multi_chunk;
wmsum_t abdstat_scatter_page_multi_zone;
wmsum_t abdstat_scatter_page_alloc_retry;
@@ -222,7 +233,7 @@ abd_free_struct_impl(abd_t *abd)
}
#ifdef _KERNEL
-static unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
/*
* Mark zfs data pages so they can be excluded from kernel crash dumps
@@ -272,18 +283,21 @@ abd_alloc_chunks(abd_t *abd, size_t size)
struct page *page, *tmp_page = NULL;
gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
- int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
- int nr_pages = abd_chunkcnt_for_bytes(size);
- int chunks = 0, zones = 0;
+ unsigned int max_order = MIN(zfs_abd_scatter_max_order,
+ ABD_MAX_ORDER - 1);
+ unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
+ unsigned int chunks = 0, zones = 0;
size_t remaining_size;
int nid = NUMA_NO_NODE;
- int alloc_pages = 0;
+ unsigned int alloc_pages = 0;
INIT_LIST_HEAD(&pages);
+ ASSERT3U(alloc_pages, <, nr_pages);
+
while (alloc_pages < nr_pages) {
- unsigned chunk_pages;
- int order;
+ unsigned int chunk_pages;
+ unsigned int order;
order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
chunk_pages = (1U << order);
@@ -597,10 +611,8 @@ abd_free_chunks(abd_t *abd)
struct scatterlist *sg;
abd_for_each_sg(abd, sg, n, i) {
- for (int j = 0; j < sg->length; j += PAGESIZE) {
- struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
- umem_free(p, PAGESIZE);
- }
+ struct page *p = nth_page(sg_page(sg), 0);
+ umem_free_aligned(p, PAGESIZE);
}
abd_free_sg_table(abd);
}
@@ -706,7 +718,7 @@ abd_free_zero_scatter(void)
__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
- umem_free(abd_zero_page, PAGESIZE);
+ umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}
@@ -729,7 +741,7 @@ abd_kstats_update(kstat_t *ksp, int rw)
wmsum_value(&abd_sums.abdstat_scatter_data_size);
as->abdstat_scatter_chunk_waste.value.ui64 =
wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
- for (int i = 0; i < MAX_ORDER; i++) {
+ for (int i = 0; i < ABD_MAX_ORDER; i++) {
as->abdstat_scatter_orders[i].value.ui64 =
wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
}
@@ -758,7 +770,7 @@ abd_init(void)
wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
- for (i = 0; i < MAX_ORDER; i++)
+ for (i = 0; i < ABD_MAX_ORDER; i++)
wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
@@ -768,7 +780,7 @@ abd_init(void)
abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
if (abd_ksp != NULL) {
- for (i = 0; i < MAX_ORDER; i++) {
+ for (i = 0; i < ABD_MAX_ORDER; i++) {
snprintf(abd_stats.abdstat_scatter_orders[i].name,
KSTAT_STRLEN, "scatter_order_%d", i);
abd_stats.abdstat_scatter_orders[i].data_type =
@@ -798,7 +810,7 @@ abd_fini(void)
wmsum_fini(&abd_sums.abdstat_scatter_cnt);
wmsum_fini(&abd_sums.abdstat_scatter_data_size);
wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
- for (int i = 0; i < MAX_ORDER; i++)
+ for (int i = 0; i < ABD_MAX_ORDER; i++)
wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
@@ -886,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
+ memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
- aiter->iter_mapaddr = NULL;
- aiter->iter_mapsize = 0;
- aiter->iter_pos = 0;
- if (abd_is_linear(abd)) {
- aiter->iter_offset = 0;
- aiter->iter_sg = NULL;
- } else {
+ if (!abd_is_linear(abd)) {
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
@@ -906,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
+ ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
@@ -917,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
+ /*
+ * Ensure that last chunk is not in use. abd_iterate_*() must clear
+ * this state (directly or abd_iter_unmap()) before advancing.
+ */
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
+ ASSERT3P(aiter->iter_page, ==, NULL);
+ ASSERT0(aiter->iter_page_doff);
+ ASSERT0(aiter->iter_page_dsize);
/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
@@ -1000,6 +1015,134 @@ abd_cache_reap_now(void)
}
#if defined(_KERNEL)
+
+/*
+ * This is abd_iter_page(), the function underneath abd_iterate_page_func().
+ * It yields the next page struct and data offset and size within it, without
+ * mapping it into the address space.
+ */
+
+/*
+ * "Compound pages" are a group of pages that can be referenced from a single
+ * struct page *. It's organised as a "head" page, followed by a series of
+ * "tail" pages.
+ *
+ * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
+ * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
+ * great many of the IO buffers we get are going to be of this type.
+ *
+ * The tail pages are just regular PAGESIZE pages, and can be safely used
+ * as-is. However, the head page has length covering itself and all the tail
+ * pages. If the ABD chunk spans multiple pages, then we can use the head page
+ * and a >PAGESIZE length, which is far more efficient.
+ *
+ * Before kernel 4.5 however, compound page heads were refcounted separately
+ * from tail pages, such that moving back to the head page would require us to
+ * take a reference to it and releasing it once we're completely finished with
+ * it. In practice, that means when our caller is done with the ABD, which we
+ * have no insight into from here. Rather than contort this API to track head
+ * page references on such ancient kernels, we disable this special compound
+ * page handling before 4.5, instead just treating each page within it as a
+ * regular PAGESIZE page (which it is). This is slightly less efficient, but
+ * makes everything far simpler.
+ *
+ * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the
+ * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to
+ * understand compound pages, or not, as required.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
+#define ABD_ITER_COMPOUND_PAGES 1
+#define ABD_ITER_PAGE_SIZE(page) \
+ (PageCompound(page) ? page_size(page) : PAGESIZE)
+#else
+#undef ABD_ITER_COMPOUND_PAGES
+#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE)
+#endif
+
+void
+abd_iter_page(struct abd_iter *aiter)
+{
+ if (abd_iter_at_end(aiter)) {
+ aiter->iter_page = NULL;
+ aiter->iter_page_doff = 0;
+ aiter->iter_page_dsize = 0;
+ return;
+ }
+
+ struct page *page;
+ size_t doff, dsize;
+
+ /*
+ * Find the page, and the start of the data within it. This is computed
+ * differently for linear and scatter ABDs; linear is referenced by
+ * virtual memory location, while scatter is referenced by page
+ * pointer.
+ */
+ if (abd_is_linear(aiter->iter_abd)) {
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+
+ /* memory address at iter_pos */
+ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
+
+ /* struct page for address */
+ page = is_vmalloc_addr(paddr) ?
+ vmalloc_to_page(paddr) : virt_to_page(paddr);
+
+ /* offset of address within the page */
+ doff = offset_in_page(paddr);
+ } else {
+ ASSERT(!abd_is_gang(aiter->iter_abd));
+
+ /* current scatter page */
+ page = nth_page(sg_page(aiter->iter_sg),
+ aiter->iter_offset >> PAGE_SHIFT);
+
+ /* position within page */
+ doff = aiter->iter_offset & (PAGESIZE - 1);
+ }
+
+#ifdef ABD_ITER_COMPOUND_PAGES
+ if (PageTail(page)) {
+ /*
+ * If this is a compound tail page, move back to the head, and
+ * adjust the offset to match. This may let us yield a much
+ * larger amount of data from a single logical page, and so
+ * leave our caller with fewer pages to process.
+ */
+ struct page *head = compound_head(page);
+ doff += ((page - head) * PAGESIZE);
+ page = head;
+ }
+#endif
+
+ ASSERT(page);
+
+ /*
+ * Compute the maximum amount of data we can take from this page. This
+ * is the smaller of:
+ * - the remaining space in the page
+ * - the remaining space in this scatterlist entry (which may not cover
+ * the entire page)
+ * - the remaining space in the abd (which may not cover the entire
+ * scatterlist entry)
+ */
+ dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+ if (!abd_is_linear(aiter->iter_abd))
+ dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
+ ASSERT3U(dsize, >, 0);
+
+ /* final iterator outputs */
+ aiter->iter_page = page;
+ aiter->iter_page_doff = doff;
+ aiter->iter_page_dsize = dsize;
+}
+
+/*
+ * Note: ABD BIO functions only needed to support vdev_classic. See comments in
+ * vdev_disk.c.
+ */
+
/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
@@ -1154,4 +1297,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
-#endif
+
+#endif /* _KERNEL */
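abd_iter_page() above fills iter_page / iter_page_doff / iter_page_dsize without mapping anything; abd_iterate_page_func() is the driver named in its comment. A sketch of the consumption loop, using only the iterator calls visible in this file; note that the new asserts in abd_iter_advance() require the caller to clear the page fields before advancing:

struct abd_iter aiter;

abd_iter_init(&aiter, abd);
while (!abd_iter_at_end(&aiter)) {
        abd_iter_page(&aiter);

        /*
         * Consume aiter.iter_page_dsize bytes starting at offset
         * aiter.iter_page_doff within aiter.iter_page, e.g. by adding
         * a bio segment, without kmapping the page.
         */
        size_t len = aiter.iter_page_dsize;

        /* release the chunk, then advance past it */
        aiter.iter_page = NULL;
        aiter.iter_page_doff = 0;
        aiter.iter_page_dsize = 0;
        abd_iter_advance(&aiter, len);
}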
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
index a95e9c334af9..02dd80c06062 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -80,12 +80,18 @@ static struct notifier_block arc_hotplug_callback_mem_nb;
/*
* Return a default max arc size based on the amount of physical memory.
+ * This may be overridden by tuning the zfs_arc_max module parameter.
*/
uint64_t
arc_default_max(uint64_t min, uint64_t allmem)
{
- /* Default to 1/2 of all memory. */
- return (MAX(allmem / 2, min));
+ uint64_t size;
+
+ if (allmem >= 1 << 30)
+ size = allmem - (1 << 30);
+ else
+ size = min;
+ return (MAX(allmem * 5 / 8, size));
}
#ifdef _KERNEL
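A worked example of the new arc_default_max() (numbers mine, not from the change): with allmem = 16 GiB, size = allmem - 1 GiB = 15 GiB while allmem * 5 / 8 = 10 GiB, so the default cap is 15 GiB, versus 8 GiB under the old allmem / 2 rule. At exactly 1 GiB, size = 0 and the 5/8 term wins at 640 MiB; below 1 GiB, size falls back to min.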
@@ -219,7 +225,11 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
arc_reduce_target_size(ptob(sc->nr_to_scan));
arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
if (current->reclaim_state != NULL)
+#ifdef HAVE_RECLAIM_STATE_RECLAIMED
+ current->reclaim_state->reclaimed += sc->nr_to_scan;
+#else
current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
+#endif
/*
* We are experiencing memory pressure which the arc_evict_zthr was
@@ -243,8 +253,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
return (sc->nr_to_scan);
}
-SPL_SHRINKER_DECLARE(arc_shrinker,
- arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+static struct shrinker *arc_shrinker = NULL;
int
arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
@@ -347,22 +356,26 @@ arc_lowmem_init(void)
* reclaim from the arc. This is done to prevent kswapd from
* swapping out pages when it is preferable to shrink the arc.
*/
- spl_register_shrinker(&arc_shrinker);
+ arc_shrinker = spl_register_shrinker("zfs-arc-shrinker",
+ arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+ VERIFY(arc_shrinker);
+
arc_set_sys_free(allmem);
}
void
arc_lowmem_fini(void)
{
- spl_unregister_shrinker(&arc_shrinker);
+ spl_unregister_shrinker(arc_shrinker);
+ arc_shrinker = NULL;
}
int
-param_set_arc_long(const char *buf, zfs_kernel_param_t *kp)
+param_set_arc_u64(const char *buf, zfs_kernel_param_t *kp)
{
int error;
- error = param_set_long(buf, kp);
+ error = spl_param_set_u64(buf, kp);
if (error < 0)
return (SET_ERROR(error));
@@ -374,13 +387,13 @@ param_set_arc_long(const char *buf, zfs_kernel_param_t *kp)
int
param_set_arc_min(const char *buf, zfs_kernel_param_t *kp)
{
- return (param_set_arc_long(buf, kp));
+ return (param_set_arc_u64(buf, kp));
}
int
param_set_arc_max(const char *buf, zfs_kernel_param_t *kp)
{
- return (param_set_arc_long(buf, kp));
+ return (param_set_arc_u64(buf, kp));
}
int
@@ -485,56 +498,5 @@ arc_unregister_hotplug(void)
}
#endif /* _KERNEL */
-/*
- * Helper function for arc_prune_async() it is responsible for safely
- * handling the execution of a registered arc_prune_func_t.
- */
-static void
-arc_prune_task(void *ptr)
-{
- arc_prune_t *ap = (arc_prune_t *)ptr;
- arc_prune_func_t *func = ap->p_pfunc;
-
- if (func != NULL)
- func(ap->p_adjust, ap->p_private);
-
- zfs_refcount_remove(&ap->p_refcnt, func);
-}
-
-/*
- * Notify registered consumers they must drop holds on a portion of the ARC
- * buffered they reference. This provides a mechanism to ensure the ARC can
- * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
- * is analogous to dnlc_reduce_cache() but more generic.
- *
- * This operation is performed asynchronously so it may be safely called
- * in the context of the arc_reclaim_thread(). A reference is taken here
- * for each registered arc_prune_t and the arc_prune_task() is responsible
- * for releasing it once the registered arc_prune_func_t has completed.
- */
-void
-arc_prune_async(int64_t adjust)
-{
- arc_prune_t *ap;
-
- mutex_enter(&arc_prune_mtx);
- for (ap = list_head(&arc_prune_list); ap != NULL;
- ap = list_next(&arc_prune_list, ap)) {
-
- if (zfs_refcount_count(&ap->p_refcnt) >= 2)
- continue;
-
- zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
- ap->p_adjust = adjust;
- if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
- ap, TQ_SLEEP) == TASKQID_INVALID) {
- zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
- continue;
- }
- ARCSTAT_BUMP(arcstat_prune);
- }
- mutex_exit(&arc_prune_mtx);
-}
-
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
"Limit on number of pages that ARC shrinker can reclaim at once");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
index ff3ef1bf6ad9..7e5bd392437e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -30,7 +30,7 @@ param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
{
int ret;
- ret = param_set_ulong(val, kp);
+ ret = spl_param_set_u64(val, kp);
if (ret < 0)
return (ret);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
index ab00d2ae14d2..5d1b4383412a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -124,7 +124,7 @@ secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
if (crgetuid(cr) == owner)
return (0);
- if (zpl_inode_owner_or_capable(kcred->user_ns, ip))
+ if (zpl_inode_owner_or_capable(zfs_init_idmap, ip))
return (0);
#if defined(CONFIG_USER_NS)
@@ -214,8 +214,10 @@ secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr,
* Determine that subject can set the file setgid flag.
*/
int
-secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid)
+secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid, zidmap_t *mnt_ns,
+ struct user_namespace *fs_ns)
{
+ gid = zfs_gid_to_vfsgid(mnt_ns, fs_ns, gid);
#if defined(CONFIG_USER_NS)
if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid)))
return (EPERM);
@@ -284,8 +286,11 @@ secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
* Determine that subject can set the file setid flags.
*/
static int
-secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner)
+secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner, zidmap_t *mnt_ns,
+ struct user_namespace *fs_ns)
{
+ owner = zfs_uid_to_vfsuid(mnt_ns, fs_ns, owner);
+
if (crgetuid(cr) == owner)
return (0);
@@ -310,13 +315,14 @@ secpolicy_vnode_stky_modify(const cred_t *cr)
int
secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
- const vattr_t *ovap, cred_t *cr)
+ const vattr_t *ovap, cred_t *cr, zidmap_t *mnt_ns,
+ struct user_namespace *fs_ns)
{
int error;
if ((vap->va_mode & S_ISUID) != 0 &&
(error = secpolicy_vnode_setid_modify(cr,
- ovap->va_uid)) != 0) {
+ ovap->va_uid, mnt_ns, fs_ns)) != 0) {
return (error);
}
@@ -334,7 +340,8 @@ secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
* group-id bit.
*/
if ((vap->va_mode & S_ISGID) != 0 &&
- secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) {
+ secpolicy_vnode_setids_setgids(cr, ovap->va_gid,
+ mnt_ns, fs_ns) != 0) {
vap->va_mode &= ~S_ISGID;
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat.c b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
index 08613b3a2042..07e0cafabb0e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/qat.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
index 1d099c95bc7c..6d0595dd5f76 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -193,7 +193,9 @@ qat_dc_init(void)
sd.huffType = CPA_DC_HT_FULL_DYNAMIC;
sd.sessDirection = CPA_DC_DIR_COMBINED;
sd.sessState = CPA_DC_STATELESS;
+#if (CPA_DC_API_VERSION_NUM_MAJOR == 1 && CPA_DC_API_VERSION_NUM_MINOR < 6)
sd.deflateWindowSize = 7;
+#endif
sd.checksum = CPA_DC_ADLER32;
status = cpaDcGetSessionSize(dc_inst_handles[i],
&sd, &sess_size, &ctx_size);
@@ -247,7 +249,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
Cpa8U *buffer_meta_src = NULL;
Cpa8U *buffer_meta_dst = NULL;
Cpa32U buffer_meta_size = 0;
- CpaDcRqResults dc_results;
+ CpaDcRqResults dc_results = {.checksum = 1};
CpaStatus status = CPA_STATUS_FAIL;
Cpa32U hdr_sz = 0;
Cpa32U compressed_sz;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
index 18b6e38d1a6e..0523a23c61e1 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
index cbdc0f350ad8..c8cbedcd5157 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -60,7 +60,7 @@ param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp)
{
int error;
- error = param_set_ulong(val, kp);
+ error = spl_param_set_u64(val, kp);
if (error < 0)
return (SET_ERROR(error));
@@ -74,7 +74,7 @@ param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp)
{
int error;
- error = param_set_ulong(val, kp);
+ error = spl_param_set_u64(val, kp);
if (error < 0)
return (SET_ERROR(error));
@@ -103,6 +103,18 @@ param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp)
return (0);
}
+int
+param_set_active_allocator(const char *val, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = -param_set_active_allocator_common(val);
+ if (error == 0)
+ error = param_set_charp(val, kp);
+
+ return (error);
+}
+
const char *
spa_history_zone(void)
{
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/trace.c b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
index a690822ae14c..32a188d169e3 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/trace.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 9a382261df73..2cea61a6294c 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -24,6 +24,7 @@
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
@@ -41,12 +42,49 @@
#include <linux/blk-cgroup.h>
#endif
+/*
+ * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
+ * block_device. Since it carries the block_device inside, it's convenient to
+ * just use the handle as a proxy.
+ *
+ * Linux 6.9.x uses a file for the same purpose.
+ *
+ * For pre-6.8, we just emulate this with a cast, since we don't need any of
+ * the other fields inside the handle.
+ */
+#if defined(HAVE_BDEV_OPEN_BY_PATH)
+typedef struct bdev_handle zfs_bdev_handle_t;
+#define BDH_BDEV(bdh) ((bdh)->bdev)
+#define BDH_IS_ERR(bdh) (IS_ERR(bdh))
+#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh))
+#define BDH_ERR_PTR(err) (ERR_PTR(err))
+#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+typedef struct file zfs_bdev_handle_t;
+#define BDH_BDEV(bdh) (file_bdev(bdh))
+#define BDH_IS_ERR(bdh) (IS_ERR(bdh))
+#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh))
+#define BDH_ERR_PTR(err) (ERR_PTR(err))
+#else
+typedef void zfs_bdev_handle_t;
+#define BDH_BDEV(bdh) ((struct block_device *)bdh)
+#define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh)))
+#define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh)))
+#define BDH_ERR_PTR(err) (ERR_PTR(err))
+#endif
+
typedef struct vdev_disk {
- struct block_device *vd_bdev;
+ zfs_bdev_handle_t *vd_bdh;
krwlock_t vd_lock;
} vdev_disk_t;
/*
+ * Maximum number of segments to add to a bio (min 4). If this is higher than
+ * the maximum allowed by the device queue or the kernel itself, it will be
+ * clamped. Setting it to zero will cause the kernel's ideal size to be used.
+ */
+uint_t zfs_vdev_disk_max_segs = 0;
+
+/*
* Unique identifier for the exclusive vdev holder.
*/
static void *zfs_vdev_holder = VDEV_HOLDER;
@@ -56,7 +94,7 @@ static void *zfs_vdev_holder = VDEV_HOLDER;
* device is missing. The missing path may be transient since the links
* can be briefly removed and recreated in response to udev events.
*/
-static unsigned zfs_vdev_open_timeout_ms = 1000;
+static uint_t zfs_vdev_open_timeout_ms = 1000;
/*
* Size of the "reserved" partition, in blocks.
@@ -64,28 +102,46 @@ static unsigned zfs_vdev_open_timeout_ms = 1000;
#define EFI_MIN_RESV_SIZE (16 * 1024)
/*
- * Virtual device vector for disks.
+ * BIO request failfast mask.
*/
-typedef struct dio_request {
- zio_t *dr_zio; /* Parent ZIO */
- atomic_t dr_ref; /* References */
- int dr_error; /* Bio error */
- int dr_bio_count; /* Count of bio's */
- struct bio *dr_bio[0]; /* Attached bio's */
-} dio_request_t;
-static fmode_t
-vdev_bdev_mode(spa_mode_t spa_mode)
+static unsigned int zfs_vdev_failfast_mask = 1;
+
+/*
+ * Convert SPA mode flags into bdev open mode flags.
+ */
+#ifdef HAVE_BLK_MODE_T
+typedef blk_mode_t vdev_bdev_mode_t;
+#define VDEV_BDEV_MODE_READ BLK_OPEN_READ
+#define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE
+#define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL
+#define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
+#else
+typedef fmode_t vdev_bdev_mode_t;
+#define VDEV_BDEV_MODE_READ FMODE_READ
+#define VDEV_BDEV_MODE_WRITE FMODE_WRITE
+#define VDEV_BDEV_MODE_EXCL FMODE_EXCL
+#define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL)
+#endif
+
+static vdev_bdev_mode_t
+vdev_bdev_mode(spa_mode_t smode)
{
- fmode_t mode = 0;
+ ASSERT3U(smode, !=, SPA_MODE_UNINIT);
+ ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));
- if (spa_mode & SPA_MODE_READ)
- mode |= FMODE_READ;
+ vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;
- if (spa_mode & SPA_MODE_WRITE)
- mode |= FMODE_WRITE;
+ if (smode & SPA_MODE_READ)
+ bmode |= VDEV_BDEV_MODE_READ;
- return (mode);
+ if (smode & SPA_MODE_WRITE)
+ bmode |= VDEV_BDEV_MODE_WRITE;
+
+ ASSERT(bmode & VDEV_BDEV_MODE_MASK);
+ ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);
+
+ return (bmode);
}
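[For instance, a pool opened with SPA_MODE_READ|SPA_MODE_WRITE maps to BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL on kernels providing blk_mode_t, and to FMODE_READ|FMODE_WRITE|FMODE_EXCL on older ones; exclusive access is requested in every case.]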
/*
@@ -105,6 +161,16 @@ bdev_whole(struct block_device *bdev)
}
#endif
+#if defined(HAVE_BDEVNAME)
+#define vdev_bdevname(bdev, name) bdevname(bdev, name)
+#else
+static inline void
+vdev_bdevname(struct block_device *bdev, char *name)
+{
+ snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
+}
+#endif
+
/*
* Returns the maximum expansion capacity of the block device (in bytes).
*
@@ -163,18 +229,60 @@ vdev_disk_error(zio_t *zio)
* which is safe from any context.
*/
printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
- "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
+ "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
zio->io_vd->vdev_path, zio->io_error, zio->io_type,
(u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
zio->io_flags);
}
+static void
+vdev_disk_kobj_evt_post(vdev_t *v)
+{
+ vdev_disk_t *vd = v->vdev_tsd;
+ if (vd && vd->vd_bdh) {
+ spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh));
+ } else {
+ vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
+ v->vdev_path);
+ }
+}
+
+static zfs_bdev_handle_t *
+vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
+{
+ vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);
+
+#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
+ return (bdev_file_open_by_path(path, bmode, holder, NULL));
+#elif defined(HAVE_BDEV_OPEN_BY_PATH)
+ return (bdev_open_by_path(path, bmode, holder, NULL));
+#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
+ return (blkdev_get_by_path(path, bmode, holder, NULL));
+#else
+ return (blkdev_get_by_path(path, bmode, holder));
+#endif
+}
+
+static void
+vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
+{
+#if defined(HAVE_BDEV_RELEASE)
+ return (bdev_release(bdh));
+#elif defined(HAVE_BLKDEV_PUT_HOLDER)
+ return (blkdev_put(BDH_BDEV(bdh), holder));
+#elif defined(HAVE_BLKDEV_PUT)
+ return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
+#else
+ fput(bdh);
+#endif
+}
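[To make the version-spanning handle API concrete, a minimal sketch of how the wrappers compose; example_probe_lbs is a hypothetical helper, not part of this change:

static int
example_probe_lbs(const char *path, spa_mode_t smode)
{
	/* Open exclusively via whichever kernel API is available. */
	zfs_bdev_handle_t *bdh =
	    vdev_blkdev_get_by_path(path, smode, zfs_vdev_holder);
	if (BDH_IS_ERR(bdh))
		return (-BDH_PTR_ERR(bdh));

	/* All block_device queries go through BDH_BDEV(). */
	int lbs = bdev_logical_block_size(BDH_BDEV(bdh));

	vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
	return (lbs);
}]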
+
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
- struct block_device *bdev;
- fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+ zfs_bdev_handle_t *bdh;
+ spa_mode_t smode = spa_mode(v->vdev_spa);
hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
vdev_disk_t *vd;
@@ -199,12 +307,13 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
boolean_t reread_part = B_FALSE;
rw_enter(&vd->vd_lock, RW_WRITER);
- bdev = vd->vd_bdev;
- vd->vd_bdev = NULL;
+ bdh = vd->vd_bdh;
+ vd->vd_bdh = NULL;
- if (bdev) {
+ if (bdh) {
+ struct block_device *bdev = BDH_BDEV(bdh);
if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
- bdevname(bdev_whole(bdev), disk_name + 5);
+ vdev_bdevname(bdev_whole(bdev), disk_name + 5);
/*
* If userland has BLKPG_RESIZE_PARTITION,
* then it should have updated the partition
@@ -224,15 +333,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
reread_part = B_TRUE;
}
- blkdev_put(bdev, mode | FMODE_EXCL);
+ vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
}
if (reread_part) {
- bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
+ bdh = vdev_blkdev_get_by_path(disk_name, smode,
zfs_vdev_holder);
- if (!IS_ERR(bdev)) {
- int error = vdev_bdev_reread_part(bdev);
- blkdev_put(bdev, mode | FMODE_EXCL);
+ if (!BDH_IS_ERR(bdh)) {
+ int error =
+ vdev_bdev_reread_part(BDH_BDEV(bdh));
+ vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
if (error == 0) {
timeout = MSEC2NSEC(
zfs_vdev_open_timeout_ms * 2);
@@ -275,58 +385,67 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
* subsequent attempts are expected to eventually succeed.
*/
hrtime_t start = gethrtime();
- bdev = ERR_PTR(-ENXIO);
- while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
- bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
+ bdh = BDH_ERR_PTR(-ENXIO);
+ while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
+ bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
zfs_vdev_holder);
- if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+ if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
+ /*
+ * There is no point in waiting since the device was
+ * removed explicitly.
+ */
+ if (v->vdev_removed)
+ break;
+
schedule_timeout(MSEC_TO_TICK(10));
- } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) {
+ } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
continue;
- } else if (IS_ERR(bdev)) {
+ } else if (BDH_IS_ERR(bdh)) {
break;
}
}
- if (IS_ERR(bdev)) {
- int error = -PTR_ERR(bdev);
+ if (BDH_IS_ERR(bdh)) {
+ int error = -BDH_PTR_ERR(bdh);
vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
(u_longlong_t)(gethrtime() - start),
(u_longlong_t)timeout);
- vd->vd_bdev = NULL;
+ vd->vd_bdh = NULL;
v->vdev_tsd = vd;
rw_exit(&vd->vd_lock);
return (SET_ERROR(error));
} else {
- vd->vd_bdev = bdev;
+ vd->vd_bdh = bdh;
v->vdev_tsd = vd;
rw_exit(&vd->vd_lock);
}
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+
/* Determine the physical block size */
- int physical_block_size = bdev_physical_block_size(vd->vd_bdev);
+ int physical_block_size = bdev_physical_block_size(bdev);
/* Determine the logical block size */
- int logical_block_size = bdev_logical_block_size(vd->vd_bdev);
+ int logical_block_size = bdev_logical_block_size(bdev);
/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
v->vdev_nowritecache = B_FALSE;
/* Set when device reports it supports TRIM. */
- v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev);
+ v->vdev_has_trim = bdev_discard_supported(bdev);
/* Set when device reports it supports secure TRIM. */
- v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev);
+ v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);
/* Inform the ZIO pipeline that we are non-rotational */
- v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
+ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
/* Physical volume size in bytes for the partition */
- *psize = bdev_capacity(vd->vd_bdev);
+ *psize = bdev_capacity(bdev);
/* Physical volume size in bytes including possible expansion space */
- *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
+ *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);
/* Based on the minimum sector size set the block size */
*physical_ashift = highbit64(MAX(physical_block_size,
@@ -346,98 +465,15 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
- if (vd->vd_bdev != NULL) {
- blkdev_put(vd->vd_bdev,
- vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
- }
+ if (vd->vd_bdh != NULL)
+ vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
+ zfs_vdev_holder);
rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
v->vdev_tsd = NULL;
}
-static dio_request_t *
-vdev_disk_dio_alloc(int bio_count)
-{
- dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
- sizeof (struct bio *) * bio_count, KM_SLEEP);
- atomic_set(&dr->dr_ref, 0);
- dr->dr_bio_count = bio_count;
- dr->dr_error = 0;
-
- for (int i = 0; i < dr->dr_bio_count; i++)
- dr->dr_bio[i] = NULL;
-
- return (dr);
-}
-
-static void
-vdev_disk_dio_free(dio_request_t *dr)
-{
- int i;
-
- for (i = 0; i < dr->dr_bio_count; i++)
- if (dr->dr_bio[i])
- bio_put(dr->dr_bio[i]);
-
- kmem_free(dr, sizeof (dio_request_t) +
- sizeof (struct bio *) * dr->dr_bio_count);
-}
-
-static void
-vdev_disk_dio_get(dio_request_t *dr)
-{
- atomic_inc(&dr->dr_ref);
-}
-
-static int
-vdev_disk_dio_put(dio_request_t *dr)
-{
- int rc = atomic_dec_return(&dr->dr_ref);
-
- /*
- * Free the dio_request when the last reference is dropped and
- * ensure zio_interpret is called only once with the correct zio
- */
- if (rc == 0) {
- zio_t *zio = dr->dr_zio;
- int error = dr->dr_error;
-
- vdev_disk_dio_free(dr);
-
- if (zio) {
- zio->io_error = error;
- ASSERT3S(zio->io_error, >=, 0);
- if (zio->io_error)
- vdev_disk_error(zio);
-
- zio_delay_interrupt(zio);
- }
- }
-
- return (rc);
-}
-
-BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
-{
- dio_request_t *dr = bio->bi_private;
- int rc;
-
- if (dr->dr_error == 0) {
-#ifdef HAVE_1ARG_BIO_END_IO_T
- dr->dr_error = BIO_END_IO_ERROR(bio);
-#else
- if (error)
- dr->dr_error = -(error);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- dr->dr_error = EIO;
-#endif
- }
-
- /* Drop reference acquired by __vdev_disk_physio */
- rc = vdev_disk_dio_put(dr);
-}
-
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
@@ -589,8 +625,467 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
return (bio);
}
+static inline uint_t
+vdev_bio_max_segs(struct block_device *bdev)
+{
+ /*
+ * Smallest of the device max segs and the tuneable max segs. Minimum
+ * 4, so there's room to finish split pages if they come up.
+ */
+ const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
+ const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
+ MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
+ const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
+
+#ifdef HAVE_BIO_MAX_SEGS
+ return (bio_max_segs(max_segs));
+#else
+ return (MIN(max_segs, BIO_MAX_PAGES));
+#endif
+}
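[A worked example of the clamping, with illustrative values: a device reporting 128 max segments with zfs_vdev_disk_max_segs=2 gives tune_max_segs = MAX(4, 2) = 4 and max_segs = MIN(4, 128) = 4, so the floor of 4 overrides an under-sized tuneable; a tuneable of 0 defers entirely to the device limit.]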
+
+static inline uint_t
+vdev_bio_max_bytes(struct block_device *bdev)
+{
+ return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
+}
+
+
+/*
+ * Virtual block IO object (VBIO)
+ *
+ * Linux block IO (BIO) objects have a limit on how many data segments (pages)
+ * they can hold. Depending on how they're allocated and structured, a large
+ * ZIO can require more than one BIO to be submitted to the kernel, which then
+ * all have to complete before we can return the completed ZIO back to ZFS.
+ *
+ * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
+ * translate a ZIO down into the kernel block layer and back again.
+ *
+ * Note that these are only used for data ZIOs (read/write). Meta-operations
+ * (flush/trim) don't need multiple BIOs and so can just make the call
+ * directly.
+ */
+typedef struct {
+ zio_t *vbio_zio; /* parent zio */
+
+ struct block_device *vbio_bdev; /* blockdev to submit bios to */
+
+ abd_t *vbio_abd; /* abd carrying borrowed linear buf */
+
+ uint_t vbio_max_segs; /* max segs per bio */
+
+ uint_t vbio_max_bytes; /* max bytes per bio */
+ uint_t vbio_lbs_mask; /* logical block size mask */
+
+ uint64_t vbio_offset; /* start offset of next bio */
+
+ struct bio *vbio_bio; /* pointer to the current bio */
+ int vbio_flags; /* bio flags */
+} vbio_t;
+
+static vbio_t *
+vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
+{
+ vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
+
+ vbio->vbio_zio = zio;
+ vbio->vbio_bdev = bdev;
+ vbio->vbio_abd = NULL;
+ vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
+ vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
+ vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
+ vbio->vbio_offset = zio->io_offset;
+ vbio->vbio_bio = NULL;
+ vbio->vbio_flags = flags;
+
+ return (vbio);
+}
+
+BIO_END_IO_PROTO(vbio_completion, bio, error);
+
+static int
+vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
+{
+ struct bio *bio = vbio->vbio_bio;
+ uint_t ssize;
+
+ while (size > 0) {
+ if (bio == NULL) {
+ /* New BIO, allocate and set up */
+ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
+ vbio->vbio_max_segs);
+ VERIFY(bio);
+
+ BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
+ bio_set_op_attrs(bio,
+ vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
+ WRITE : READ, vbio->vbio_flags);
+
+ if (vbio->vbio_bio) {
+ bio_chain(vbio->vbio_bio, bio);
+ vdev_submit_bio(vbio->vbio_bio);
+ }
+ vbio->vbio_bio = bio;
+ }
+
+ /*
+ * Only load as much of the current page data as will fit in
+ * the space left in the BIO, respecting lbs alignment. Older
+ * kernels will error if we try to overfill the BIO, while
+ * newer ones will accept it and split the BIO. This ensures
+ * everything works on older kernels and avoids additional
+ * overhead on the newer ones.
+ */
+ ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
+ vbio->vbio_lbs_mask);
+ if (ssize > 0 &&
+ bio_add_page(bio, page, ssize, offset) == ssize) {
+ /* Accepted, adjust and load any remaining. */
+ size -= ssize;
+ offset += ssize;
+ continue;
+ }
+
+ /* No room, set up for a new BIO and loop */
+ vbio->vbio_offset += BIO_BI_SIZE(bio);
+
+ /* Signal new BIO allocation wanted */
+ bio = NULL;
+ }
+
+ return (0);
+}
+
+/* Iterator callback to submit ABD pages to the vbio. */
+static int
+vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+ vbio_t *vbio = priv;
+ return (vbio_add_page(vbio, page, len, off));
+}
+
+/* Create some BIOs, fill them with data and submit them */
+static void
+vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
+{
+ /*
+ * We plug so we can submit the BIOs as we go and only unplug them when
+ * they are fully created and submitted. This is important; if we don't
+ * plug, then the kernel may start executing earlier BIOs while we're
+ * still creating and executing later ones, and if the device goes
+ * away while that's happening, older kernels can get confused and
+ * trample memory.
+ */
+ struct blk_plug plug;
+ blk_start_plug(&plug);
+
+ (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
+ ASSERT(vbio->vbio_bio);
+
+ vbio->vbio_bio->bi_end_io = vbio_completion;
+ vbio->vbio_bio->bi_private = vbio;
+
+ /*
+ * Once submitted, vbio_bio now owns vbio (through bi_private) and we
+ * can't touch it again. The bio may complete and vbio_completion() be
+ * called and free the vbio before this task is run again, so we must
+ * consider it invalid from this point.
+ */
+ vdev_submit_bio(vbio->vbio_bio);
+
+ blk_finish_plug(&plug);
+}
+
+/* IO completion callback */
+BIO_END_IO_PROTO(vbio_completion, bio, error)
+{
+ vbio_t *vbio = bio->bi_private;
+ zio_t *zio = vbio->vbio_zio;
+
+ ASSERT(zio);
+
+ /* Capture and log any errors */
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+ zio->io_error = 0;
+ if (error)
+ zio->io_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ zio->io_error = EIO;
+#endif
+ ASSERT3U(zio->io_error, >=, 0);
+
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ /* Return the BIO to the kernel */
+ bio_put(bio);
+
+ /*
+ * If we copied the ABD before issuing it, clean up and return the copy
+ * to the original ABD: reads must copy the data back, while writes can
+ * simply return the borrowed buffer.
+ */
+ if (vbio->vbio_abd != NULL) {
+ void *buf = abd_to_buf(vbio->vbio_abd);
+ abd_free(vbio->vbio_abd);
+ vbio->vbio_abd = NULL;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
+ }
+
+ /* Final cleanup */
+ kmem_free(vbio, sizeof (vbio_t));
+
+ /* All done, submit for processing */
+ zio_delay_interrupt(zio);
+}
+
+/*
+ * Iterator callback to count ABD pages and check their size & alignment.
+ *
+ * On Linux, each BIO segment can take a page pointer, and an offset+length of
+ * the data within that page. A page can be arbitrarily large ("compound"
+ * pages) but we still have to ensure the data portion is correctly sized and
+ * aligned to the logical block size, to ensure that if the kernel wants to
+ * split the BIO, the two halves will still be properly aligned.
+ *
+ * NOTE: if you change this function, change the copy in
+ * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
+ * data there to validate the change you're making.
+ */
+typedef struct {
+ uint_t bmask;
+ uint_t npages;
+ uint_t end;
+} vdev_disk_check_pages_t;
+
+static int
+vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+{
+ (void) page;
+ vdev_disk_check_pages_t *s = priv;
+
+ /*
+ * If we didn't finish on a block size boundary last time, then there
+ * would be a gap if we tried to use this ABD as-is, so abort.
+ */
+ if (s->end != 0)
+ return (1);
+
+ /*
+ * Note if we're taking less than a full block, so we can check it
+ * above on the next call.
+ */
+ s->end = (off+len) & s->bmask;
+
+ /* All blocks after the first must start on a block size boundary. */
+ if (s->npages != 0 && (off & s->bmask) != 0)
+ return (1);
+
+ s->npages++;
+ return (0);
+}
+
+/*
+ * Check if we can submit the pages in this ABD to the kernel as-is.
+ * Returns B_TRUE if they can be submitted as-is, B_FALSE if the data
+ * must first be copied to an aligned linear buffer.
+ */
+static boolean_t
+vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
+{
+ vdev_disk_check_pages_t s = {
+ .bmask = bdev_logical_block_size(bdev)-1,
+ .npages = 0,
+ .end = 0,
+ };
+
+ if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
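[The comment above points to a user-space copy of this logic for testing; the following is a hedged, self-contained sketch of the same rule, assuming a 4 KiB logical block size — it is not the actual test file:

#include <stdio.h>

/* Stand-alone mirror of vdev_disk_check_pages_cb() for experimentation. */
typedef struct {
	unsigned bmask;		/* logical block size - 1 */
	unsigned npages;	/* segments accepted so far */
	unsigned end;		/* nonzero if last segment ended mid-block */
} check_state_t;

static int
check_segment(check_state_t *s, unsigned off, unsigned len)
{
	if (s->end != 0)
		return (1);	/* previous segment left a gap */
	s->end = (off + len) & s->bmask;
	if (s->npages != 0 && (off & s->bmask) != 0)
		return (1);	/* later segments must start on a block */
	s->npages++;
	return (0);
}

int
main(void)
{
	check_state_t s = { .bmask = 4096 - 1, .npages = 0, .end = 0 };

	/*
	 * A full 4K page, then a 512-byte tail: both are accepted, but
	 * any further segment would be rejected because end != 0.
	 */
	printf("%d %d\n",
	    check_segment(&s, 0, 4096), check_segment(&s, 0, 512));
	return (0);
}]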
+
+static int
+vdev_disk_io_rw(zio_t *zio)
+{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+ int flags = 0;
+
+ /*
+ * Accessing outside the block device is never allowed.
+ */
+ if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
+ vdev_dbgmsg(zio->io_vd,
+ "Illegal access %llu size %llu, device size %llu",
+ (u_longlong_t)zio->io_offset,
+ (u_longlong_t)zio->io_size,
+ (u_longlong_t)i_size_read(bdev->bd_inode));
+ return (SET_ERROR(EIO));
+ }
+
+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
+ v->vdev_failfast == B_TRUE) {
+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
+ }
+
+ /*
+ * Check alignment of the incoming ABD. If any part of it would require
+ * submitting a page that is not aligned to the logical block size,
+ * then we take a copy into a linear buffer and submit that instead.
+ * This should be impossible on a 512b LBS, and fairly rare on 4K,
+ * usually requiring abnormally-small data blocks (e.g. gang blocks)
+ * mixed into the same ABD as larger ones (e.g. aggregated).
+ */
+ abd_t *abd = zio->io_abd;
+ if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
+ void *buf;
+ if (zio->io_type == ZIO_TYPE_READ)
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ else
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+
+ /*
+ * Wrap the copy in an abd_t, so we can use the same iterators
+ * to count and fill the vbio later.
+ */
+ abd = abd_get_from_buf(buf, zio->io_size);
+
+ /*
+ * False here would mean the borrowed copy has an invalid
+ * alignment too, which would mean we've somehow been passed a
+ * linear ABD with an interior page that has a non-zero offset
+ * or a size not a multiple of PAGE_SIZE. This is not possible.
+ * It would mean either zio_buf_alloc() or its underlying
+ * allocators have done something extremely strange, or our
+ * math in vdev_disk_check_pages() is wrong. In either case,
+ * something is seriously wrong and it's not safe to continue.
+ */
+ VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
+ }
+
+ /* Allocate vbio, with a pointer to the borrowed ABD if necessary */
+ vbio_t *vbio = vbio_alloc(zio, bdev, flags);
+ if (abd != zio->io_abd)
+ vbio->vbio_abd = abd;
+
+ /* Fill it with data pages and submit it to the kernel */
+ vbio_submit(vbio, abd, zio->io_size);
+ return (0);
+}
+
+/* ========== */
+
+/*
+ * This is the classic, battle-tested BIO submission code. Until we're totally
+ * sure that the new code is safe and correct in all cases, this will remain
+ * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
+ * load time.
+ *
+ * These functions have been renamed to vdev_classic_* to make it clear what
+ * they belong to, but their implementations are unchanged.
+ */
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+ zio_t *dr_zio; /* Parent ZIO */
+ atomic_t dr_ref; /* References */
+ int dr_error; /* Bio error */
+ int dr_bio_count; /* Count of bio's */
+ struct bio *dr_bio[]; /* Attached bio's */
+} dio_request_t;
+
+static dio_request_t *
+vdev_classic_dio_alloc(int bio_count)
+{
+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
+ sizeof (struct bio *) * bio_count, KM_SLEEP);
+ atomic_set(&dr->dr_ref, 0);
+ dr->dr_bio_count = bio_count;
+ dr->dr_error = 0;
+
+ for (int i = 0; i < dr->dr_bio_count; i++)
+ dr->dr_bio[i] = NULL;
+
+ return (dr);
+}
+
+static void
+vdev_classic_dio_free(dio_request_t *dr)
+{
+ int i;
+
+ for (i = 0; i < dr->dr_bio_count; i++)
+ if (dr->dr_bio[i])
+ bio_put(dr->dr_bio[i]);
+
+ kmem_free(dr, sizeof (dio_request_t) +
+ sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+static void
+vdev_classic_dio_get(dio_request_t *dr)
+{
+ atomic_inc(&dr->dr_ref);
+}
+
+static void
+vdev_classic_dio_put(dio_request_t *dr)
+{
+ int rc = atomic_dec_return(&dr->dr_ref);
+
+ /*
+ * Free the dio_request when the last reference is dropped and
+ * ensure zio_interpret is called only once with the correct zio
+ */
+ if (rc == 0) {
+ zio_t *zio = dr->dr_zio;
+ int error = dr->dr_error;
+
+ vdev_classic_dio_free(dr);
+
+ if (zio) {
+ zio->io_error = error;
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ zio_delay_interrupt(zio);
+ }
+ }
+}
+
+BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
+{
+ dio_request_t *dr = bio->bi_private;
+
+ if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+ if (error)
+ dr->dr_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ dr->dr_error = EIO;
+#endif
+ }
+
+ /* Drop reference acquired by vdev_classic_physio */
+ vdev_classic_dio_put(dr);
+}
+
static inline unsigned int
-vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
bio_size, abd_offset);
@@ -603,9 +1098,16 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
}
static int
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
- size_t io_size, uint64_t io_offset, int rw, int flags)
+vdev_classic_physio(zio_t *zio)
{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+ size_t io_size = zio->io_size;
+ uint64_t io_offset = zio->io_offset;
+ int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
+ int flags = 0;
+
dio_request_t *dr;
uint64_t abd_offset;
uint64_t bio_offset;
@@ -628,10 +1130,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
}
retry:
- dr = vdev_disk_dio_alloc(bio_count);
+ dr = vdev_classic_dio_alloc(bio_count);
- if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
- bio_set_flags_failfast(bdev, &flags);
+ if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
+ zio->io_vd->vdev_failfast == B_TRUE) {
+ bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
+ zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
+ }
dr->dr_zio = zio;
@@ -660,23 +1165,23 @@ retry:
* this should be rare - see the comment above.
*/
if (dr->dr_bio_count == i) {
- vdev_disk_dio_free(dr);
+ vdev_classic_dio_free(dr);
bio_count *= 2;
goto retry;
}
- nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
+ nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
if (unlikely(dr->dr_bio[i] == NULL)) {
- vdev_disk_dio_free(dr);
+ vdev_classic_dio_free(dr);
return (SET_ERROR(ENOMEM));
}
- /* Matching put called by vdev_disk_physio_completion */
- vdev_disk_dio_get(dr);
+ /* Matching put called by vdev_classic_physio_completion */
+ vdev_classic_dio_get(dr);
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
- dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+ dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
dr->dr_bio[i]->bi_private = dr;
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
@@ -690,7 +1195,7 @@ retry:
}
/* Extra reference to protect dio_request during vdev_submit_bio */
- vdev_disk_dio_get(dr);
+ vdev_classic_dio_get(dr);
if (dr->dr_bio_count > 1)
blk_start_plug(&plug);
@@ -704,11 +1209,13 @@ retry:
if (dr->dr_bio_count > 1)
blk_finish_plug(&plug);
- (void) vdev_disk_dio_put(dr);
+ vdev_classic_dio_put(dr);
return (error);
}
+/* ========== */
+
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
zio_t *zio = bio->bi_private;
@@ -751,39 +1258,123 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
return (0);
}
+BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
+{
+ zio_t *zio = bio->bi_private;
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+ zio->io_error = -error;
+#endif
+ bio_put(bio);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+ zio_interrupt(zio);
+}
+
+/*
+ * Wrappers for the different secure erase and discard APIs. We use async
+ * when available; in this case, *biop is set to the last bio in the chain.
+ */
static int
-vdev_disk_io_trim(zio_t *zio)
+vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
+ sector_t nsect, struct bio **biop)
{
- vdev_t *v = zio->io_vd;
- vdev_disk_t *vd = v->vdev_tsd;
+ *biop = NULL;
+ int error;
#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
- if (zio->io_trim_flags & ZIO_TRIM_SECURE) {
- return (-blkdev_issue_secure_erase(vd->vd_bdev,
- zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
- } else {
- return (-blkdev_issue_discard(vd->vd_bdev,
- zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS));
- }
-#elif defined(HAVE_BLKDEV_ISSUE_DISCARD)
- unsigned long trim_flags = 0;
-#if defined(BLKDEV_DISCARD_SECURE)
- if (zio->io_trim_flags & ZIO_TRIM_SECURE)
- trim_flags |= BLKDEV_DISCARD_SECURE;
+ error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
+ sector, nsect, GFP_NOFS);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
+ error = __blkdev_issue_discard(BDH_BDEV(bdh),
+ sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
+ error = blkdev_issue_discard(BDH_BDEV(bdh),
+ sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
+#else
+#error "unsupported kernel"
#endif
- return (-blkdev_issue_discard(vd->vd_bdev,
- zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags));
+
+ return (error);
+}
+
+static int
+vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
+ sector_t nsect, struct bio **biop)
+{
+ *biop = NULL;
+ int error;
+
+#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
+ error = __blkdev_issue_discard(BDH_BDEV(bdh),
+ sector, nsect, GFP_NOFS, 0, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
+ error = __blkdev_issue_discard(BDH_BDEV(bdh),
+ sector, nsect, GFP_NOFS, biop);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
+ error = blkdev_issue_discard(BDH_BDEV(bdh),
+ sector, nsect, GFP_NOFS, 0);
+#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
+ error = blkdev_issue_discard(BDH_BDEV(bdh),
+ sector, nsect, GFP_NOFS);
#else
-#error "Unsupported kernel"
+#error "unsupported kernel"
#endif
+
+ return (error);
}
+/*
+ * Entry point for TRIM ops. This calls the right wrapper for secure erase or
+ * discard, and then does the appropriate finishing work for error vs success
+ * and async vs sync.
+ */
+static int
+vdev_disk_io_trim(zio_t *zio)
+{
+ int error;
+ struct bio *bio;
+
+ zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
+ sector_t sector = zio->io_offset >> 9;
+ sector_t nsects = zio->io_size >> 9;
+
+ if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+ error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
+ else
+ error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio);
+
+ if (error != 0)
+ return (SET_ERROR(-error));
+
+ if (bio == NULL) {
+ /*
+ * This was a synchronous op that completed successfully, so
+ * return it to ZFS immediately.
+ */
+ zio_interrupt(zio);
+ } else {
+ /*
+ * This was an asynchronous op; set up completion callback and
+ * issue it.
+ */
+ bio->bi_private = zio;
+ bio->bi_end_io = vdev_disk_discard_end_io;
+ vdev_submit_bio(bio);
+ }
+
+ return (0);
+}
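[Note the error convention here: the vdev_bdev_issue_* wrappers hand back negative kernel errnos, so the negation in SET_ERROR(-error) above converts them to the positive errno values ZFS expects.]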
+
+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
+
static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
- int rw, error;
+ int error;
/*
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
@@ -801,7 +1392,7 @@ vdev_disk_io_start(zio_t *zio)
* If the vdev is closed, it's likely due to a failed reopen and is
* in the UNAVAIL state. Nothing to be done here but return failure.
*/
- if (vd->vd_bdev == NULL) {
+ if (vd->vd_bdh == NULL) {
rw_exit(&vd->vd_lock);
zio->io_error = ENXIO;
zio_interrupt(zio);
@@ -809,74 +1400,72 @@ vdev_disk_io_start(zio_t *zio)
}
switch (zio->io_type) {
- case ZIO_TYPE_IOCTL:
+ case ZIO_TYPE_FLUSH:
if (!vdev_readable(v)) {
- rw_exit(&vd->vd_lock);
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return;
- }
-
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- if (v->vdev_nowritecache) {
- zio->io_error = SET_ERROR(ENOTSUP);
- break;
- }
-
- error = vdev_disk_io_flush(vd->vd_bdev, zio);
+ /* Drive not there, can't flush */
+ error = SET_ERROR(ENXIO);
+ } else if (zfs_nocacheflush) {
+ /* Flushing disabled by operator, declare success */
+ error = 0;
+ } else if (v->vdev_nowritecache) {
+ /* This vdev not capable of flushing */
+ error = SET_ERROR(ENOTSUP);
+ } else {
+ /*
+ * Issue the flush. If successful, the response will
+ * be handled in the completion callback, so we're done.
+ */
+ error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
if (error == 0) {
rw_exit(&vd->vd_lock);
return;
}
-
- zio->io_error = error;
-
- break;
-
- default:
- zio->io_error = SET_ERROR(ENOTSUP);
}
+ /* Couldn't issue the flush, so set the error and return it */
rw_exit(&vd->vd_lock);
+ zio->io_error = error;
zio_execute(zio);
return;
- case ZIO_TYPE_WRITE:
- rw = WRITE;
- break;
-
- case ZIO_TYPE_READ:
- rw = READ;
- break;
case ZIO_TYPE_TRIM:
- zio->io_error = vdev_disk_io_trim(zio);
+ error = vdev_disk_io_trim(zio);
rw_exit(&vd->vd_lock);
- zio_interrupt(zio);
+ if (error) {
+ zio->io_error = error;
+ zio_execute(zio);
+ }
return;
- default:
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ error = vdev_disk_io_rw_fn(zio);
rw_exit(&vd->vd_lock);
- zio->io_error = SET_ERROR(ENOTSUP);
- zio_interrupt(zio);
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ }
return;
- }
- zio->io_target_timestamp = zio_handle_io_delay(zio);
- error = __vdev_disk_physio(vd->vd_bdev, zio,
- zio->io_size, zio->io_offset, rw, 0);
- rw_exit(&vd->vd_lock);
+ default:
+ /*
+ * Getting here means our parent vdev has made a very strange
+ * request of us, and shouldn't happen. Assert here to force a
+ * crash in dev builds, but in production return the IO
+ * unhandled. The pool will likely suspend anyway but that's
+ * nicer than crashing the kernel.
+ */
+ ASSERT3S(zio->io_type, ==, -1);
- if (error) {
- zio->io_error = error;
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENOTSUP);
zio_interrupt(zio);
return;
}
+
+ __builtin_unreachable();
}
static void
@@ -891,8 +1480,8 @@ vdev_disk_io_done(zio_t *zio)
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
- if (zfs_check_media_change(vd->vd_bdev)) {
- invalidate_bdev(vd->vd_bdev);
+ if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
+ invalidate_bdev(BDH_BDEV(vd->vd_bdh));
v->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
}
@@ -925,8 +1514,49 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}
+/*
+ * BIO submission method. See comment above about vdev_classic.
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
+ */
+static uint_t zfs_vdev_disk_classic = 0; /* default new */
+
+/* Set submission function from module parameter */
+static int
+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
+{
+ int err = param_set_uint(buf, kp);
+ if (err < 0)
+ return (SET_ERROR(err));
+
+ vdev_disk_io_rw_fn =
+ zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
+
+ printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
+ zfs_vdev_disk_classic ? "classic" : "new");
+
+ return (0);
+}
+
+/*
+ * At first vdev use, set the submission function from the default value if
+ * it hasn't been set already.
+ */
+static int
+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ (void) spa;
+ (void) nv;
+ (void) tsd;
+
+ if (vdev_disk_io_rw_fn == NULL)
+ vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
+ vdev_classic_physio : vdev_disk_io_rw;
+
+ return (0);
+}
+
vdev_ops_t vdev_disk_ops = {
- .vdev_op_init = NULL,
+ .vdev_op_init = vdev_disk_init,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
@@ -947,7 +1577,8 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
- .vdev_op_leaf = B_TRUE /* leaf vdev */
+ .vdev_op_leaf = B_TRUE, /* leaf vdev */
+ .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};
/*
@@ -976,17 +1607,17 @@ MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
- uint64_t val;
+ uint_t val;
int error;
- error = kstrtoull(buf, 0, &val);
+ error = kstrtouint(buf, 0, &val);
if (error < 0)
return (SET_ERROR(error));
if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
return (SET_ERROR(-EINVAL));
- error = param_set_ulong(buf, kp);
+ error = param_set_uint(buf, kp);
if (error < 0)
return (SET_ERROR(error));
@@ -996,19 +1627,32 @@ param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
- uint64_t val;
+ uint_t val;
int error;
- error = kstrtoull(buf, 0, &val);
+ error = kstrtouint(buf, 0, &val);
if (error < 0)
return (SET_ERROR(error));
if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
return (SET_ERROR(-EINVAL));
- error = param_set_ulong(buf, kp);
+ error = param_set_uint(buf, kp);
if (error < 0)
return (SET_ERROR(error));
return (0);
}
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
+ "Timeout before determining that a device is missing");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
+ "Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
+
+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
+ "Maximum number of data segments to add to an IO request (min 4)");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
+ vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
+ "Use classic BIO submission method");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
index f073145326e3..ac41a2615f16 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -53,8 +53,8 @@ static taskq_t *vdev_file_taskq;
* impact the vdev_ashift setting which can only be set at vdev creation
* time.
*/
-static unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
-static unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
+static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
+static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
static void
vdev_file_hold(vdev_t *vd)
@@ -242,7 +242,7 @@ vdev_file_io_start(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_file_t *vf = vd->vdev_tsd;
- if (zio->io_type == ZIO_TYPE_IOCTL) {
+ if (zio->io_type == ZIO_TYPE_FLUSH) {
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
@@ -250,33 +250,27 @@ vdev_file_io_start(zio_t *zio)
return;
}
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
-
- if (zfs_nocacheflush)
- break;
-
- /*
- * We cannot safely call vfs_fsync() when PF_FSTRANS
- * is set in the current context. Filesystems like
- * XFS include sanity checks to verify it is not
- * already set, see xfs_vm_writepage(). Therefore
- * the sync must be dispatched to a different context.
- */
- if (__spl_pf_fstrans_check()) {
- VERIFY3U(taskq_dispatch(vdev_file_taskq,
- vdev_file_io_fsync, zio, TQ_SLEEP), !=,
- TASKQID_INVALID);
- return;
- }
-
- zio->io_error = zfs_file_fsync(vf->vf_file,
- O_SYNC | O_DSYNC);
- break;
- default:
- zio->io_error = SET_ERROR(ENOTSUP);
+ if (zfs_nocacheflush) {
+ zio_execute(zio);
+ return;
}
+ /*
+ * We cannot safely call vfs_fsync() when PF_FSTRANS
+ * is set in the current context. Filesystems like
+ * XFS include sanity checks to verify it is not
+ * already set, see xfs_vm_writepage(). Therefore
+ * the sync must be dispatched to a different context.
+ */
+ if (__spl_pf_fstrans_check()) {
+ VERIFY3U(taskq_dispatch(vdev_file_taskq,
+ vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+ TASKQID_INVALID);
+ return;
+ }
+
+ zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
zio_execute(zio);
return;
} else if (zio->io_type == ZIO_TYPE_TRIM) {
@@ -376,7 +370,7 @@ vdev_ops_t vdev_disk_ops = {
#endif
-ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW,
"Logical ashift for file-based devices");
-ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW,
"Physical ashift for file-based devices");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_label_os.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_label_os.c
new file mode 100644
index 000000000000..3d965b89a962
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_label_os.c
@@ -0,0 +1,45 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2023 by iXsystems, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+
+/*
+ * Check if the reserved boot area is in-use.
+ *
+ * This function always returns 0, as there are no known external uses
+ * of the reserved area on Linux.
+ */
+int
+vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd)
+{
+ (void) spa;
+ (void) childvd;
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
index b70691ab31c1..48abbc010917 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -493,10 +493,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp)
{
zfs_acl_node_t *aclnode;
- while ((aclnode = list_head(&aclp->z_acl))) {
- list_remove(&aclp->z_acl, aclnode);
+ while ((aclnode = list_remove_head(&aclp->z_acl)))
zfs_acl_node_free(aclnode);
- }
aclp->z_acl_count = 0;
aclp->z_acl_bytes = 0;
}
@@ -525,7 +523,7 @@ zfs_acl_valid_ace_type(uint_t type, uint_t flags)
entry_type == ACE_EVERYONE || entry_type == 0 ||
entry_type == ACE_IDENTIFIER_GROUP);
default:
- if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ if (type <= MAX_ACE_TYPE)
return (B_TRUE);
}
return (B_FALSE);
@@ -629,18 +627,18 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
return (NULL);
}
-static uint64_t
-zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+static uintptr_t
+zfs_ace_walk(void *datap, uintptr_t cookie, int aclcnt,
uint16_t *flags, uint16_t *type, uint32_t *mask)
{
(void) aclcnt;
zfs_acl_t *aclp = datap;
- zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)cookie;
uint64_t who;
acep = zfs_acl_next_ace(aclp, acep, &who, mask,
flags, type);
- return ((uint64_t)(uintptr_t)acep);
+ return ((uintptr_t)acep);
}
/*
@@ -1163,6 +1161,7 @@ zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
cb->cb_acl_node);
}
+ ASSERT3P(cb->cb_acl_node, !=, NULL);
*dataptr = cb->cb_acl_node->z_acldata;
*length = cb->cb_acl_node->z_size;
}
@@ -1284,7 +1283,7 @@ acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
*/
static int
ace_trivial_common(void *acep, int aclcnt,
- uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uintptr_t (*walk)(void *, uintptr_t, int,
uint16_t *, uint16_t *, uint32_t *))
{
uint16_t flags;
@@ -1801,7 +1800,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp,
*/
int
zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
- vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zidmap_t *mnt_ns)
{
int error;
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
@@ -1888,8 +1887,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
acl_ids->z_mode |= S_ISGID;
} else {
if ((acl_ids->z_mode & S_ISGID) &&
- secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ secpolicy_vnode_setids_setgids(cr, gid, mnt_ns,
+ zfs_i_user_ns(ZTOI(dzp))) != 0) {
acl_ids->z_mode &= ~S_ISGID;
+ }
}
if (acl_ids->z_aclp == NULL) {
@@ -1920,8 +1921,8 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
trim = B_TRUE;
- zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE,
- trim, acl_ids->z_aclp);
+ zfs_acl_chmod(S_ISDIR(vap->va_mode), acl_ids->z_mode,
+ B_FALSE, trim, acl_ids->z_aclp);
}
}
@@ -1977,7 +1978,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
if (mask == 0)
return (SET_ERROR(ENOSYS));
- if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr,
+ zfs_init_idmap)))
return (error);
mutex_enter(&zp->z_acl_lock);
@@ -2136,7 +2138,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
if (zp->z_pflags & ZFS_IMMUTABLE)
return (SET_ERROR(EPERM));
- if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
+ zfs_init_idmap)))
return (error);
error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp,
@@ -2228,8 +2231,7 @@ static int
zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
{
if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) &&
- (!Z_ISDEV(ZTOI(zp)->i_mode) ||
- (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ (!Z_ISDEV(ZTOI(zp)->i_mode) || (v4_mode & WRITE_MASK_ATTRS))) {
return (SET_ERROR(EROFS));
}
@@ -2282,7 +2284,7 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
*/
static int
zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
- boolean_t anyaccess, cred_t *cr)
+ boolean_t anyaccess, cred_t *cr, zidmap_t *mnt_ns)
{
zfsvfs_t *zfsvfs = ZTOZSB(zp);
zfs_acl_t *aclp;
@@ -2298,7 +2300,13 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
uid_t gowner;
uid_t fowner;
- zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+ if (mnt_ns) {
+ fowner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)),
+ KUID_TO_SUID(ZTOI(zp)->i_uid));
+ gowner = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ZTOI(zp)),
+ KGID_TO_SGID(ZTOI(zp)->i_gid));
+ } else
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
mutex_enter(&zp->z_acl_lock);
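/*
 * In short (a note, not part of the patch): when the file is reached
 * through an idmapped mount (mnt_ns != NULL), the raw inode owner ids
 * are first translated into the ids that mount presents, so the ACE
 * owner/group matching below sees the same identities the VFS does;
 * plain mounts keep using the regular FUID mapping.
 */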
@@ -2409,7 +2417,8 @@ zfs_has_access(znode_t *zp, cred_t *cr)
{
uint32_t have = ACE_ALL_PERMS;
- if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr,
+ zfs_init_idmap) != 0) {
uid_t owner;
owner = zfs_fuid_map_id(ZTOZSB(zp),
@@ -2439,7 +2448,8 @@ zfs_has_access(znode_t *zp, cred_t *cr)
* we want to avoid that here.
*/
static int
-zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr)
+zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr,
+ zidmap_t *mnt_ns)
{
int err, mask;
int unmapped = 0;
@@ -2452,8 +2462,9 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr)
return (unmapped ? SET_ERROR(EPERM) : 0);
}
-#if defined(HAVE_IOPS_PERMISSION_USERNS)
- err = generic_permission(cr->user_ns, ZTOI(zp), mask);
+#if (defined(HAVE_IOPS_PERMISSION_USERNS) || \
+ defined(HAVE_IOPS_PERMISSION_IDMAP))
+ err = generic_permission(mnt_ns, ZTOI(zp), mask);
#else
err = generic_permission(ZTOI(zp), mask);
#endif
@@ -2468,7 +2479,7 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr)
static int
zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
- boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr, zidmap_t *mnt_ns)
{
zfsvfs_t *zfsvfs = ZTOZSB(zp);
int err;
@@ -2518,20 +2529,20 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
}
if (zp->z_pflags & ZFS_ACL_TRIVIAL)
- return (zfs_zaccess_trivial(zp, working_mode, cr));
+ return (zfs_zaccess_trivial(zp, working_mode, cr, mnt_ns));
- return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr, mnt_ns));
}
static int
zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
- cred_t *cr)
+ cred_t *cr, zidmap_t *mnt_ns)
{
if (*working_mode != ACE_WRITE_DATA)
return (SET_ERROR(EACCES));
return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
- check_privs, B_FALSE, cr));
+ check_privs, B_FALSE, cr, mnt_ns));
}
int
@@ -2566,7 +2577,6 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
}
if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) {
- owner = B_TRUE;
if (zdp->z_mode & S_IXUSR) {
mutex_exit(&zdp->z_acl_lock);
return (0);
@@ -2576,7 +2586,6 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
}
}
if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) {
- groupmbr = B_TRUE;
if (zdp->z_mode & S_IXGRP) {
mutex_exit(&zdp->z_acl_lock);
return (0);
@@ -2596,9 +2605,11 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
slow:
DTRACE_PROBE(zfs__fastpath__execute__access__miss);
- ZFS_ENTER(ZTOZSB(zdp));
- error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
- ZFS_EXIT(ZTOZSB(zdp));
+ if ((error = zfs_enter(ZTOZSB(zdp), FTAG)) != 0)
+ return (error);
+ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
+ zfs_init_idmap);
+ zfs_exit(ZTOZSB(zdp), FTAG);
return (error);
}
@@ -2609,7 +2620,8 @@ slow:
* can define any form of access.
*/
int
-zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr,
+ zidmap_t *mnt_ns)
{
uint32_t working_mode;
int error;
@@ -2648,8 +2660,10 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
}
- owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid),
- cr, ZFS_OWNER);
+ owner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)),
+ KUID_TO_SUID(ZTOI(zp)->i_uid));
+ owner = zfs_fuid_map_id(ZTOZSB(zp), owner, cr, ZFS_OWNER);
+
/*
* Map the bits required to the standard inode flags
* S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits
@@ -2674,7 +2688,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
needed_bits |= S_IXUSR;
if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
- &check_privs, skipaclchk, cr)) == 0) {
+ &check_privs, skipaclchk, cr, mnt_ns)) == 0) {
if (is_attr)
zrele(xzp);
return (secpolicy_vnode_access2(cr, ZTOI(zp), owner,
@@ -2688,7 +2702,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
}
if (error && (flags & V_APPEND)) {
- error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr,
+ mnt_ns);
}
if (error && check_privs) {
@@ -2699,7 +2714,6 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
* read_acl/read_attributes
*/
- error = 0;
ASSERT(working_mode != 0);
if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
@@ -2755,20 +2769,22 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
* NFSv4-style ZFS ACL format and call zfs_zaccess()
*/
int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr,
+ zidmap_t *mnt_ns)
{
- return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr,
+ mnt_ns));
}
/*
* Access function for secpolicy_vnode_setattr
*/
int
-zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+zfs_zaccess_unix(void *zp, int mode, cred_t *cr)
{
int v4_mode = zfs_unix_to_v4(mode >> 6);
- return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, zfs_init_idmap));
}
/* See zfs_zaccess_delete() */
@@ -2845,7 +2861,7 @@ static const boolean_t zfs_write_implies_delete_child = B_TRUE;
* zfs_write_implies_delete_child
*/
int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zidmap_t *mnt_ns)
{
uint32_t wanted_dirperms;
uint32_t dzp_working_mode = 0;
@@ -2872,7 +2888,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
* (This is part of why we're checking the target first.)
*/
zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
- &zpcheck_privs, B_FALSE, cr);
+ &zpcheck_privs, B_FALSE, cr, mnt_ns);
if (zp_error == EACCES) {
/* We hit a DENY ACE. */
if (!zpcheck_privs)
@@ -2894,7 +2910,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
if (zfs_write_implies_delete_child)
wanted_dirperms |= ACE_WRITE_DATA;
dzp_error = zfs_zaccess_common(dzp, wanted_dirperms,
- &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr, mnt_ns);
if (dzp_error == EACCES) {
/* We hit a DENY ACE. */
if (!dzpcheck_privs)
@@ -2976,7 +2992,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
int
zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
- znode_t *tzp, cred_t *cr)
+ znode_t *tzp, cred_t *cr, zidmap_t *mnt_ns)
{
int add_perm;
int error;
@@ -2998,21 +3014,21 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
* If that succeeds then check for add_file/add_subdir permissions
*/
- if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+ if ((error = zfs_zaccess_delete(sdzp, szp, cr, mnt_ns)))
return (error);
/*
* If we have a tzp, see if we can delete it?
*/
if (tzp) {
- if ((error = zfs_zaccess_delete(tdzp, tzp, cr)))
+ if ((error = zfs_zaccess_delete(tdzp, tzp, cr, mnt_ns)))
return (error);
}
/*
* Now check for add permissions
*/
- error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr, mnt_ns);
return (error);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index aae19f6346fd..54ed70d0394f 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -392,7 +392,20 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
zfsctl_snapshot_hold(se);
rw_enter(&se->se_taskqid_lock, RW_WRITER);
- ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
+ /*
+	 * If this condition happens, we dispatched once and are now
+	 * trying to dispatch _again_ before that task has returned.
+	 *
+	 * So let's just return - if that task fails at unmounting,
+	 * we'll eventually dispatch again, and if it succeeds,
+	 * no problem.
+ */
+ if (se->se_taskqid != TASKQID_INVALID) {
+ rw_exit(&se->se_taskqid_lock);
+ zfsctl_snapshot_rele(se);
+ return;
+ }
se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
rw_exit(&se->se_taskqid_lock);
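/*
 * A minimal sketch (not part of the patch) of the dispatch-once pattern
 * the hunk above adopts: check the saved task id under the write lock
 * and bail out if a delayed task is already pending, rather than
 * asserting. Names prefixed my_/e_ are illustrative only.
 */
static void
my_dispatch_once(my_entry_t *e, int delay)
{
	rw_enter(&e->e_lock, RW_WRITER);
	if (e->e_taskqid != TASKQID_INVALID) {
		/* A task is already in flight; it will redo the work. */
		rw_exit(&e->e_lock);
		return;
	}
	e->e_taskqid = taskq_dispatch_delay(system_delay_taskq,
	    my_expire_cb, e, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
	rw_exit(&e->e_lock);
}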
@@ -465,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip)
*/
static struct inode *
zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
- const struct file_operations *fops, const struct inode_operations *ops)
+ const struct file_operations *fops, const struct inode_operations *ops,
+ uint64_t creation)
{
- inode_timespec_t now;
struct inode *ip;
znode_t *zp;
+ inode_timespec_t now = {.tv_sec = creation};
ip = new_inode(zfsvfs->z_sb);
if (ip == NULL)
return (NULL);
- now = current_time(ip);
+ if (!creation)
+ now = current_time(ip);
zp = ITOZ(ip);
ASSERT3P(zp->z_dirlocks, ==, NULL);
ASSERT3P(zp->z_acl_cached, ==, NULL);
@@ -485,9 +500,10 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
zp->z_atime_dirty = B_FALSE;
zp->z_zn_prefetch = B_FALSE;
zp->z_is_sa = B_FALSE;
+#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
zp->z_is_mapped = B_FALSE;
+#endif
zp->z_is_ctldir = B_TRUE;
- zp->z_is_stale = B_FALSE;
zp->z_sa_hdl = NULL;
zp->z_blksz = 0;
zp->z_seq = 0;
@@ -504,9 +520,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
ip->i_uid = SUID_TO_KUID(0);
ip->i_gid = SGID_TO_KGID(0);
ip->i_blkbits = SPA_MINBLOCKSHIFT;
- ip->i_atime = now;
- ip->i_mtime = now;
- ip->i_ctime = now;
+ zpl_inode_set_atime_to_ts(ip, now);
+ zpl_inode_set_mtime_to_ts(ip, now);
+ zpl_inode_set_ctime_to_ts(ip, now);
ip->i_fop = fops;
ip->i_op = ops;
#if defined(IOP_XATTR)
@@ -521,7 +537,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
- zfsvfs->z_nr_znodes++;
membar_producer();
mutex_exit(&zfsvfs->z_znodes_lock);
@@ -538,14 +553,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
{
struct inode *ip = NULL;
+ uint64_t creation = 0;
+ dsl_dataset_t *snap_ds;
+ dsl_pool_t *pool;
while (ip == NULL) {
ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
if (ip)
break;
+ if (id <= ZFSCTL_INO_SNAPDIRS && !creation) {
+ pool = dmu_objset_pool(zfsvfs->z_os);
+ dsl_pool_config_enter(pool, FTAG);
+ if (!dsl_dataset_hold_obj(pool,
+ ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) {
+ creation = dsl_get_creation(snap_ds);
+ dsl_dataset_rele(snap_ds, FTAG);
+ }
+ dsl_pool_config_exit(pool, FTAG);
+ }
+
/* May fail due to concurrent zfsctl_inode_alloc() */
- ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
+ ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation);
}
return (ip);
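/*
 * A sketch of the hold/rele idiom used above to read the snapshot's
 * creation time ("obj" is an assumed snapshot object number): take the
 * pool config lock, hold the dataset by object, read the property,
 * then release in reverse order.
 */
dsl_pool_t *pool = dmu_objset_pool(zfsvfs->z_os);
dsl_dataset_t *ds;
uint64_t creation = 0;

dsl_pool_config_enter(pool, FTAG);
if (dsl_dataset_hold_obj(pool, obj, FTAG, &ds) == 0) {
	creation = dsl_get_creation(ds);	/* seconds since the epoch */
	dsl_dataset_rele(ds, FTAG);
}
dsl_pool_config_exit(pool, FTAG);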
@@ -567,7 +596,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
ASSERT(zfsvfs->z_ctldir == NULL);
zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
- &zpl_fops_root, &zpl_ops_root);
+ &zpl_fops_root, &zpl_ops_root, 0);
if (zfsvfs->z_ctldir == NULL)
return (SET_ERROR(ENOENT));
@@ -673,17 +702,19 @@ zfsctl_fid(struct inode *ip, fid_t *fidp)
uint64_t object = zp->z_id;
zfid_short_t *zfid;
int i;
+ int error;
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
if (zfsctl_is_snapdir(ip)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (zfsctl_snapdir_fid(ip, fidp));
}
if (fidp->fid_len < SHORT_FID_LEN) {
fidp->fid_len = SHORT_FID_LEN;
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOSPC));
}
@@ -698,7 +729,7 @@ zfsctl_fid(struct inode *ip, fid_t *fidp)
for (i = 0; i < sizeof (zfid->zf_gen); i++)
zfid->zf_gen[i] = 0;
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -776,7 +807,8 @@ zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp,
zfsvfs_t *zfsvfs = ITOZSB(dip);
int error = 0;
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
if (strcmp(name, "..") == 0) {
*ipp = dip->i_sb->s_root->d_inode;
@@ -793,7 +825,7 @@ zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp,
if (*ipp == NULL)
error = SET_ERROR(ENOENT);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -810,11 +842,12 @@ zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp,
uint64_t id;
int error;
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
if (error) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -823,7 +856,7 @@ zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp,
if (*ipp == NULL)
error = SET_ERROR(ENOENT);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -844,7 +877,8 @@ zfsctl_snapdir_rename(struct inode *sdip, const char *snm,
if (!zfs_admin_snapshot)
return (SET_ERROR(EACCES));
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
@@ -902,7 +936,7 @@ out:
kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -922,7 +956,8 @@ zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr,
if (!zfs_admin_snapshot)
return (SET_ERROR(EACCES));
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
@@ -951,7 +986,7 @@ out:
kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1076,7 +1111,8 @@ zfsctl_snapshot_mount(struct path *path, int flags)
return (SET_ERROR(EISDIR));
zfsvfs = ITOZSB(ip);
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
@@ -1164,7 +1200,7 @@ error:
kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
kmem_free(full_path, MAXPATHLEN);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1228,10 +1264,11 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
znode_t *dzp;
int error;
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
if (zfsvfs->z_shares_dir == 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOTSUP));
}
@@ -1240,7 +1277,7 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
zrele(dzp);
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
index be65f0a2e245..f707959c9445 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -29,13 +29,13 @@
typedef struct zfs_dbgmsg {
procfs_list_node_t zdm_node;
uint64_t zdm_timestamp;
- int zdm_size;
- char zdm_msg[1]; /* variable length allocation */
+ uint_t zdm_size;
+ char zdm_msg[]; /* variable length allocation */
} zfs_dbgmsg_t;
static procfs_list_t zfs_dbgmsgs;
-static int zfs_dbgmsg_size = 0;
-int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+static uint_t zfs_dbgmsg_size = 0;
+static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
/*
* Internal ZFS debug messages are enabled by default.
@@ -68,14 +68,14 @@ zfs_dbgmsg_show(struct seq_file *f, void *p)
}
static void
-zfs_dbgmsg_purge(int max_size)
+zfs_dbgmsg_purge(uint_t max_size)
{
while (zfs_dbgmsg_size > max_size) {
zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list);
if (zdm == NULL)
return;
- int size = zdm->zdm_size;
+ uint_t size = zdm->zdm_size;
kmem_free(zdm, size);
zfs_dbgmsg_size -= size;
}
@@ -135,7 +135,7 @@ __set_error(const char *file, const char *func, int line, int err)
void
__zfs_dbgmsg(char *buf)
{
- int size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+ uint_t size = sizeof (zfs_dbgmsg_t) + strlen(buf) + 1;
zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
zdm->zdm_size = size;
zdm->zdm_timestamp = gethrestime_sec();
@@ -144,7 +144,7 @@ __zfs_dbgmsg(char *buf)
mutex_enter(&zfs_dbgmsgs.pl_lock);
procfs_list_add(&zfs_dbgmsgs, zdm);
zfs_dbgmsg_size += size;
- zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+ zfs_dbgmsg_purge(zfs_dbgmsg_maxsize);
mutex_exit(&zfs_dbgmsgs.pl_lock);
}
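/*
 * A worked note on the size accounting above: with the old
 * "char zdm_msg[1]" member, sizeof (zfs_dbgmsg_t) already included one
 * byte of message storage, which quietly covered the NUL terminator.
 * A true C99 flexible array member ("char zdm_msg[]") contributes
 * nothing to sizeof, so the terminator must now be counted explicitly:
 *
 *	size = sizeof (zfs_dbgmsg_t)	(header only)
 *	     + strlen(buf)		(message bytes)
 *	     + 1			(NUL terminator)
 */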
@@ -175,7 +175,8 @@ __dprintf(boolean_t dprint, const char *file, const char *func,
newfile = file;
}
- i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func);
+ i = snprintf(buf, size, "%px %s%s:%d:%s(): ",
+ curthread, prefix, newfile, line, func);
if (i < size) {
va_start(adx, fmt);
@@ -252,6 +253,8 @@ zfs_dbgmsg_print(const char *tag)
module_param(zfs_dbgmsg_enable, int, 0644);
MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
-module_param(zfs_dbgmsg_maxsize, int, 0644);
+/* BEGIN CSTYLED */
+module_param(zfs_dbgmsg_maxsize, uint, 0644);
+/* END CSTYLED */
MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
index c5b3b5ce7fc0..1eeabe53d23c 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -649,6 +649,8 @@ zfs_rmnode(znode_t *zp)
objset_t *os = zfsvfs->z_os;
znode_t *xzp = NULL;
dmu_tx_t *tx;
+ znode_hold_t *zh;
+ uint64_t z_id = zp->z_id;
uint64_t acl_obj;
uint64_t xattr_obj;
uint64_t links;
@@ -666,8 +668,9 @@ zfs_rmnode(znode_t *zp)
* Not enough space to delete some xattrs.
* Leave it in the unlinked set.
*/
+ zh = zfs_znode_hold_enter(zfsvfs, z_id);
zfs_znode_dmu_fini(zp);
-
+ zfs_znode_hold_exit(zfsvfs, zh);
return;
}
}
@@ -686,7 +689,9 @@ zfs_rmnode(znode_t *zp)
* Not enough space or we were interrupted by unmount.
* Leave the file in the unlinked set.
*/
+ zh = zfs_znode_hold_enter(zfsvfs, z_id);
zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
return;
}
}
@@ -726,7 +731,9 @@ zfs_rmnode(znode_t *zp)
* which point we'll call zfs_unlinked_drain() to process it).
*/
dmu_tx_abort(tx);
+ zh = zfs_znode_hold_enter(zfsvfs, z_id);
zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
goto out;
}
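/*
 * A note on the new bracketing (hedged, not from the patch text):
 * zfs_znode_hold_enter()/zfs_znode_hold_exit() serialize against other
 * holders of the same object id, presumably so zfs_znode_dmu_fini()
 * cannot tear down the SA handle while e.g. a concurrent zfs_zget()
 * is attaching to the same object.
 */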
@@ -926,6 +933,74 @@ zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
return (error);
}
+static int
+zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[3];
+ uint64_t mtime[2], ctime[2];
+ uint64_t links;
+ int count = 0;
+ int error;
+
+ if (zp_is_dir && !zfs_dirempty(zp))
+ return (SET_ERROR(ENOTEMPTY));
+
+ if (ZTOI(zp)->i_nlink <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on %lu is %u, "
+ "should be at least %u", zp->z_id,
+ (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
+ set_nlink(ZTOI(zp), zp_is_dir + 1);
+ }
+ drop_nlink(ZTOI(zp));
+ if (ZTOI(zp)->i_nlink == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ clear_nlink(ZTOI(zp));
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ links = ZTOI(zp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT3U(error, ==, 0);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Forcefully drop an nlink reference from (zp) and mark it for deletion if it
+ * was the last link. This *must* only be done to znodes which have already
+ * been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in
+ * the error path of zfs_rename(), where we have to correct the nlink count if
+ * we failed to link the target as well as failing to re-link the original
+ * znodes.
+ */
+int
+zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
+{
+ int error;
+
+ mutex_enter(&zp->z_lock);
+ error = zfs_drop_nlink_locked(zp, tx, unlinkedp);
+ mutex_exit(&zp->z_lock);
+
+ return (error);
+}
+
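/*
 * A hypothetical caller sketch for the new helper (names and context
 * assumed, not from the patch): in the zfs_rename() error path, after
 * zfs_link_destroy() with ZRENAMING skipped the nlink bookkeeping,
 * drop the stray link and queue the znode for deletion if it was the
 * last one.
 */
boolean_t unlinked;
int error = zfs_drop_nlink(szp, tx, &unlinked);
if (error == 0 && unlinked)
	zfs_unlinked_add(szp, tx);	/* caller's job when unlinkedp != NULL */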
/*
* Unlink zp from dl, and mark zp for deletion if this was the last link. Can
* fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
@@ -966,31 +1041,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
return (error);
}
- if (ZTOI(zp)->i_nlink <= zp_is_dir) {
- zfs_panic_recover("zfs: link count on %lu is %u, "
- "should be at least %u", zp->z_id,
- (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
- set_nlink(ZTOI(zp), zp_is_dir + 1);
- }
- drop_nlink(ZTOI(zp));
- if (ZTOI(zp)->i_nlink == zp_is_dir) {
- zp->z_unlinked = B_TRUE;
- clear_nlink(ZTOI(zp));
- unlinked = B_TRUE;
- } else {
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
- NULL, &ctime, sizeof (ctime));
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
- NULL, &zp->z_pflags, sizeof (zp->z_pflags));
- zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
- ctime);
- }
- links = ZTOI(zp)->i_nlink;
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
- NULL, &links, sizeof (links));
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- count = 0;
- ASSERT(error == 0);
+		/* The only error case is !zfs_dirempty(), which was checked earlier. */
+ error = zfs_drop_nlink_locked(zp, tx, &unlinked);
+ ASSERT3U(error, ==, 0);
mutex_exit(&zp->z_lock);
} else {
error = zfs_dropname(dl, zp, dzp, tx, flag);
@@ -1066,11 +1119,8 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
*xzpp = NULL;
- if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)))
- return (error);
-
if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
- &acl_ids)) != 0)
+ &acl_ids, zfs_init_idmap)) != 0)
return (error);
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
zfs_acl_ids_free(&acl_ids);
@@ -1218,7 +1268,8 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
cr, ZFS_OWNER);
if ((uid = crgetuid(cr)) == downer || uid == fowner ||
- zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
+ zfs_init_idmap) == 0)
return (0);
else
return (secpolicy_vnode_remove(cr));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index e12f7c3ced43..bc753614be27 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -246,7 +246,7 @@ zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
{
loff_t rc;
- if (*offp < 0 || *offp > MAXOFFSET_T)
+ if (*offp < 0)
return (EINVAL);
rc = vfs_llseek(fp, *offp, whence);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
index 67b864aa77a9..663474ea49ab 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -135,7 +135,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
vecnum = cmd - ZFS_IOC_FIRST;
- zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) {
error = -SET_ERROR(EFAULT);
@@ -146,7 +146,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
if (error == 0 && rc != 0)
error = -SET_ERROR(EFAULT);
out:
- kmem_free(zc, sizeof (zfs_cmd_t));
+ vmem_free(zc, sizeof (zfs_cmd_t));
return (error);
}
@@ -282,6 +282,8 @@ zfsdev_detach(void)
#define ZFS_DEBUG_STR ""
#endif
+zidmap_t *zfs_init_idmap;
+
static int
openzfs_init_os(void)
{
@@ -305,6 +307,8 @@ openzfs_init_os(void)
printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n");
#endif /* CONFIG_FS_POSIX_ACL */
+ zfs_init_idmap = (zidmap_t *)zfs_get_init_idmap();
+
return (0);
}
@@ -369,8 +373,7 @@ MODULE_ALIAS("zcommon");
MODULE_ALIAS("zzstd");
MODULE_DESCRIPTION("ZFS");
MODULE_AUTHOR(ZFS_META_AUTHOR);
-MODULE_LICENSE("Lua: MIT");
-MODULE_LICENSE("zstd: Dual BSD/GPL");
-MODULE_LICENSE("Dual BSD/GPL");
+MODULE_LICENSE("Dual MIT/GPL"); /* lua */
+MODULE_LICENSE("Dual BSD/GPL"); /* zstd / misc */
MODULE_LICENSE(ZFS_META_LICENSE);
MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
index eb7c5f6166d2..e2431fe8a803 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -279,11 +279,11 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
for (int i = 0; i < ARRAY_SIZE(type_map); i++) {
if (type_map[i].ztm_type & property->pd_types) {
- len += snprintf(buf + len, buflen - len, "%s ",
- type_map[i].ztm_name);
+ len += kmem_scnprintf(buf + len, buflen - len,
+ "%s ", type_map[i].ztm_name);
}
}
- len += snprintf(buf + len, buflen - len, "\n");
+ len += kmem_scnprintf(buf + len, buflen - len, "\n");
return (len);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
index abb6dbe67cdf..c2ed67c438c6 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
this_seg_start = orig_loffset;
rq_for_each_segment(bv, rq, iter) {
- if (uio->iter.bio) {
- /*
- * If uio->iter.bio is present, then we know we've saved
- * uio->iter from a previous call to this function, and
- * we can skip ahead in this rq_for_each_segment() loop
- * to where we last left off. That way, we don't need
- * to iterate over tons of segments we've already
- * processed - we can just restore the "saved state".
- */
- iter = uio->iter;
- bv = uio->bv;
- this_seg_start = uio->uio_loffset;
- memset(&uio->iter, 0, sizeof (uio->iter));
- continue;
- }
-
/*
* Lookup what the logical offset of the last byte of this
* segment is.
@@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
copied = 1; /* We copied some data */
}
- if (n == 0) {
- /*
- * All done copying. Save our 'iter' value to the uio.
- * This allows us to "save our state" and skip ahead in
- * the rq_for_each_segment() loop the next time we call
- * call zfs_uiomove_bvec_rq() on this uio (which we
- * will be doing for any remaining data in the uio).
- */
- uio->iter = iter; /* make a copy of the struct data */
- uio->bv = bv;
- return (0);
- }
-
this_seg_start = this_seg_end + 1;
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index a67ba821d06f..2015c20d7340 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -56,7 +56,6 @@
#include <sys/sunddi.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
-#include <sys/spa_boot.h>
#include <sys/objlist.h>
#include <sys/zpl.h>
#include <linux/vfs_compat.h>
@@ -274,8 +273,10 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
* Sync a specific filesystem.
*/
dsl_pool_t *dp;
+ int error;
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
dp = dmu_objset_pool(zfsvfs->z_os);
/*
@@ -283,14 +284,14 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
* filesystems which may exist on a suspended pool.
*/
if (spa_suspended(dp->dp_spa)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
if (zfsvfs->z_log != NULL)
zil_commit(zfsvfs->z_log, 0);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
} else {
/*
* Sync all ZFS filesystems. This is what happens when you
@@ -607,7 +608,8 @@ zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
}
if (tmp != *val) {
- (void) strcpy(setpoint, "temporary");
+ if (setpoint)
+ (void) strcpy(setpoint, "temporary");
*val = tmp;
}
return (0);
@@ -783,9 +785,7 @@ zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
}
error = zfsvfs_create_impl(zfvp, zfsvfs, os);
- if (error != 0) {
- dmu_objset_disown(os, B_TRUE, zfsvfs);
- }
+
return (error);
}
@@ -825,6 +825,7 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
error = zfsvfs_init(zfsvfs, os);
if (error != 0) {
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
*zfvp = NULL;
zfsvfs_free(zfsvfs);
return (error);
@@ -848,8 +849,6 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
if (error)
return (error);
- zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-
/*
* If we are not mounting (ie: online recv), then we don't
* have to worry about replaying the log as we blocked all
@@ -857,7 +856,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
*/
if (mounting) {
ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
- dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+ error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+ if (error)
+ return (error);
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
+ &zfsvfs->z_kstat.dk_zil_sums);
/*
* During replay we remove the read only flag to
@@ -921,6 +924,10 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
/* restore readonly bit */
if (readonly != 0)
readonly_changed_cb(zfsvfs, B_TRUE);
+ } else {
+ ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
+ &zfsvfs->z_kstat.dk_zil_sums);
}
/*
@@ -1087,7 +1094,8 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp)
uint64_t refdbytes, availbytes, usedobjs, availobjs;
int err = 0;
- ZFS_ENTER(zfsvfs);
+ if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (err);
dmu_objset_space(zfsvfs->z_os,
&refdbytes, &availbytes, &usedobjs, &availobjs);
@@ -1148,7 +1156,7 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp)
err = zfs_statfs_project(zfsvfs, zp, statp, bshift);
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (err);
}
@@ -1158,13 +1166,14 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
znode_t *rootzp;
int error;
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
if (error == 0)
*ipp = ZTOI(rootzp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1185,7 +1194,7 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
int objects = 0;
int i = 0, j = 0;
- zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+ zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
mutex_enter(&zfsvfs->z_znodes_lock);
while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
@@ -1221,7 +1230,7 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
zrele(zp);
}
- kmem_free(zp_array, max_array * sizeof (znode_t *));
+ vmem_free(zp_array, max_array * sizeof (znode_t *));
return (objects);
}
@@ -1231,23 +1240,30 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
* and inode caches. This can occur when the ARC needs to free meta data
* blocks but can't because they are all pinned by entries in these caches.
*/
+#if defined(HAVE_SUPER_BLOCK_S_SHRINK)
+#define S_SHRINK(sb) (&(sb)->s_shrink)
+#elif defined(HAVE_SUPER_BLOCK_S_SHRINK_PTR)
+#define S_SHRINK(sb) ((sb)->s_shrink)
+#endif
+
int
zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
{
zfsvfs_t *zfsvfs = sb->s_fs_info;
int error = 0;
- struct shrinker *shrinker = &sb->s_shrink;
+ struct shrinker *shrinker = S_SHRINK(sb);
struct shrink_control sc = {
.nr_to_scan = nr_to_scan,
.gfp_mask = GFP_KERNEL,
};
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
defined(SHRINK_CONTROL_HAS_NID) && \
defined(SHRINKER_NUMA_AWARE)
- if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
+ if (shrinker->flags & SHRINKER_NUMA_AWARE) {
*objects = 0;
for_each_online_node(sc.nid) {
*objects += (*shrinker->scan_objects)(shrinker, &sc);
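/*
 * A compat sketch (the exact kernel version boundary is an assumption):
 * later kernels turned sb->s_shrink from an embedded struct shrinker
 * into a pointer, and the S_SHRINK() macro above lets this code obtain
 * a pointer either way:
 */
struct shrinker *shrinker = S_SHRINK(sb);	/* works for both layouts */
unsigned long freed = (*shrinker->scan_objects)(shrinker, &sc);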
@@ -1283,7 +1299,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
#endif
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
"pruning, nr_to_scan=%lu objects=%d error=%d\n",
@@ -1320,12 +1336,11 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
* may add the parents of dir-based xattrs to the taskq
* so we want to wait for these.
*
- * We can safely read z_nr_znodes without locking because the
- * VFS has already blocked operations which add to the
- * z_all_znodes list and thus increment z_nr_znodes.
+ * We can safely check z_all_znodes for being empty because the
+ * VFS has already blocked operations which add to it.
*/
int round = 0;
- while (zfsvfs->z_nr_znodes > 0) {
+ while (!list_is_empty(&zfsvfs->z_all_znodes)) {
taskq_wait_outstanding(dsl_pool_zrele_taskq(
dmu_objset_pool(zfsvfs->z_os)), 0);
if (++round > 1 && !unmounting)
@@ -1479,7 +1494,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
* read-only flag, pretend it was set, as done for snapshots.
*/
if (!canwrite)
- vfs->vfs_readonly = true;
+ vfs->vfs_readonly = B_TRUE;
error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
if (error) {
@@ -1513,7 +1528,6 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
sb->s_op = &zpl_super_operations;
sb->s_xattr = zpl_xattr_handlers;
sb->s_export_op = &zpl_export_operations;
- sb->s_d_op = &zpl_dentry_operations;
/* Set features for file system. */
zfs_set_fuid_feature(zfsvfs);
@@ -1547,6 +1561,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
error = zfs_root(zfsvfs, &root_inode);
if (error) {
(void) zfs_umount(sb);
+		zfsvfs = NULL; /* avoid double-free; freed in zfs_umount() */
goto out;
}
@@ -1554,6 +1569,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
sb->s_root = d_make_root(root_inode);
if (sb->s_root == NULL) {
(void) zfs_umount(sb);
+		zfsvfs = NULL; /* avoid double-free; freed in zfs_umount() */
error = SET_ERROR(ENOMEM);
goto out;
}
@@ -1651,6 +1667,7 @@ zfs_umount(struct super_block *sb)
}
zfsvfs_free(zfsvfs);
+ sb->s_fs_info = NULL;
return (0);
}
@@ -1740,7 +1757,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp));
}
- ZFS_ENTER(zfsvfs);
+ if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (err);
/* A zero fid_gen means we are in the .zfs control directories */
if (fid_gen == 0 &&
(object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
@@ -1756,7 +1774,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
*/
VERIFY3P(igrab(*ipp), !=, NULL);
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -1764,14 +1782,14 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
if ((err = zfs_zget(zfsvfs, object, &zp))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (err);
}
/* Don't export xattr stuff */
if (zp->z_pflags & ZFS_XATTR) {
zrele(zp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOENT));
}
@@ -1786,7 +1804,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen,
fid_gen);
zrele(zp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOENT));
}
@@ -1794,7 +1812,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
if (*ipp)
zfs_znode_update_vfs(ITOZ(*ipp));
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -1869,8 +1887,8 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
err2 = zfs_rezget(zp);
if (err2) {
+ zpl_d_drop_aliases(ZTOI(zp));
remove_inode_hash(ZTOI(zp));
- zp->z_is_stale = B_TRUE;
}
/* see comment in zfs_suspend_fs() */
@@ -2041,91 +2059,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
}
/*
- * Read a property stored within the master node.
- */
-int
-zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
-{
- uint64_t *cached_copy = NULL;
-
- /*
- * Figure out where in the objset_t the cached copy would live, if it
- * is available for the requested property.
- */
- if (os != NULL) {
- switch (prop) {
- case ZFS_PROP_VERSION:
- cached_copy = &os->os_version;
- break;
- case ZFS_PROP_NORMALIZE:
- cached_copy = &os->os_normalization;
- break;
- case ZFS_PROP_UTF8ONLY:
- cached_copy = &os->os_utf8only;
- break;
- case ZFS_PROP_CASE:
- cached_copy = &os->os_casesensitivity;
- break;
- default:
- break;
- }
- }
- if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
- *value = *cached_copy;
- return (0);
- }
-
- /*
- * If the property wasn't cached, look up the file system's value for
- * the property. For the version property, we look up a slightly
- * different string.
- */
- const char *pname;
- int error = ENOENT;
- if (prop == ZFS_PROP_VERSION)
- pname = ZPL_VERSION_STR;
- else
- pname = zfs_prop_to_name(prop);
-
- if (os != NULL) {
- ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
- error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
- }
-
- if (error == ENOENT) {
- /* No value set, use the default value */
- switch (prop) {
- case ZFS_PROP_VERSION:
- *value = ZPL_VERSION;
- break;
- case ZFS_PROP_NORMALIZE:
- case ZFS_PROP_UTF8ONLY:
- *value = 0;
- break;
- case ZFS_PROP_CASE:
- *value = ZFS_CASE_SENSITIVE;
- break;
- case ZFS_PROP_ACLTYPE:
- *value = ZFS_ACLTYPE_OFF;
- break;
- default:
- return (error);
- }
- error = 0;
- }
-
- /*
- * If one of the methods for getting the property value above worked,
- * copy it into the objset_t's cache.
- */
- if (error == 0 && cached_copy != NULL) {
- *cached_copy = *value;
- }
-
- return (error);
-}
-
-/*
* Return true if the corresponding vfs's unmounted flag is set.
* Otherwise return false.
* If this function returns true we know VFS unmount has been initiated.
@@ -2164,6 +2097,9 @@ zfs_init(void)
zfs_znode_init();
dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
register_filesystem(&zpl_fs_type);
+#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
+ register_fo_extend(&zpl_file_operations);
+#endif
}
void
@@ -2174,6 +2110,9 @@ zfs_fini(void)
*/
taskq_wait(system_delay_taskq);
taskq_wait(system_taskq);
+#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
+ unregister_fo_extend(&zpl_file_operations);
+#endif
unregister_filesystem(&zpl_fs_type);
zfs_znode_fini();
zfsctl_fini();
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index d6ff838806eb..1cecad9f7755 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -82,13 +82,13 @@
* to freed memory. The example below illustrates the following Big Rules:
*
* (1) A check must be made in each zfs thread for a mounted file system.
- * This is done avoiding races using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
- * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * This is done avoiding races using zfs_enter(zfsvfs).
+ * A zfs_exit(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with zfs_verify_zp(zp). Both of these macros
* can return EIO from the calling function.
*
* (2) zrele() should always be the last thing except for zil_commit() (if
- * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the
+ * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
* last reference, the vnode/znode can be freed, so the zp may point to
* freed memory. Second, the last reference will call zfs_zinactive(),
* which may induce a lot of work -- pushing cached pages (which acquires
@@ -107,7 +107,7 @@
* dmu_tx_assign(). This is critical because we don't want to block
* while holding locks.
*
- * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This
* reduces lock contention and CPU usage when we must wait (note that if
* throughput is constrained by the storage, nearly every transaction
* must wait).
@@ -142,7 +142,7 @@
*
* In general, this is how things should be ordered in each vnode op:
*
- * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * zfs_enter(zfsvfs); // exit if unmounted
* top:
* zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
* rw_enter(...); // grab any other locks you need
@@ -160,7 +160,7 @@
* goto top;
* }
* dmu_tx_abort(tx); // abort DMU tx
- * ZFS_EXIT(zfsvfs); // finished in zfs
+ * zfs_exit(zfsvfs); // finished in zfs
* return (error); // really out of space
* }
* error = do_real_work(); // do whatever this VOP does
@@ -171,7 +171,7 @@
* zfs_dirent_unlock(dl); // unlock directory entry
* zrele(...); // release held znodes
* zil_commit(zilog, foid); // synchronous when necessary
- * ZFS_EXIT(zfsvfs); // finished in zfs
+ * zfs_exit(zfsvfs); // finished in zfs
* return (error); // done, report error
*/
int
@@ -180,22 +180,29 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
(void) cr;
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
/* Honor ZFS_APPENDONLY file attribute */
- if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
((flag & O_APPEND) == 0)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
}
- /* Keep a count of the synchronous opens in the znode */
- if (flag & O_SYNC)
- atomic_inc_32(&zp->z_sync_cnt);
+ /*
+ * Keep a count of the synchronous opens in the znode. On first
+ * synchronous open we must convert all previous async transactions
+ * into sync to keep correct ordering.
+ */
+ if (flag & O_SYNC) {
+ if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
+ zil_async_to_sync(zfsvfs->z_log, zp->z_id);
+ }
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
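/*
 * Why the first opener matters (a note, not part of the patch):
 * atomic_inc_32_nv() returns the post-increment value, so the "== 1"
 * test above fires exactly once when z_sync_cnt goes from 0 to 1.
 * Promoting the object's queued async itxs to sync at that point keeps
 * earlier async writes ordered before later synchronous ones.
 */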
@@ -205,56 +212,60 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
(void) cr;
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
/* Decrement the synchronous opens in the znode */
if (flag & O_SYNC)
atomic_dec_32(&zp->z_sync_cnt);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
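/*
 * A minimal sketch of the entry/exit convention this change rolls out
 * across the file: the old ZFS_ENTER()/ZFS_VERIFY_ZP() macros returned
 * from the caller implicitly, while zfs_enter_verify_zp()/zfs_exit()
 * hand back an error and take an FTAG for reference tracking.
 */
int
my_vnode_op(znode_t *zp)	/* illustrative wrapper name */
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	/* ... operate on zp ... */
	zfs_exit(zfsvfs, FTAG);
	return (0);
}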
#if defined(_KERNEL)
+
+static int zfs_fillpage(struct inode *ip, struct page *pp);
+
/*
* When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Write: If we find a memory mapped page, we write to *both*
- * the page and the dmu buffer.
+ * between the DMU cache and the memory mapped pages. Update all mapped
+ * pages with the contents of the corresponding dmu buffer.
*/
void
update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
{
- struct inode *ip = ZTOI(zp);
- struct address_space *mp = ip->i_mapping;
- struct page *pp;
- uint64_t nbytes;
- int64_t off;
- void *pb;
+ struct address_space *mp = ZTOI(zp)->i_mapping;
+ int64_t off = start & (PAGE_SIZE - 1);
- off = start & (PAGE_SIZE-1);
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
- nbytes = MIN(PAGE_SIZE - off, len);
+ uint64_t nbytes = MIN(PAGE_SIZE - off, len);
- pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
if (pp) {
if (mapping_writably_mapped(mp))
flush_dcache_page(pp);
- pb = kmap(pp);
- (void) dmu_read(os, zp->z_id, start + off, nbytes,
- pb + off, DMU_READ_PREFETCH);
+ void *pb = kmap(pp);
+ int error = dmu_read(os, zp->z_id, start + off,
+ nbytes, pb + off, DMU_READ_PREFETCH);
kunmap(pp);
- if (mapping_writably_mapped(mp))
- flush_dcache_page(pp);
+ if (error) {
+ SetPageError(pp);
+ ClearPageUptodate(pp);
+ } else {
+ ClearPageError(pp);
+ SetPageUptodate(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ }
- mark_page_accessed(pp);
- SetPageUptodate(pp);
- ClearPageError(pp);
unlock_page(pp);
put_page(pp);
}
@@ -265,38 +276,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
}
/*
- * When a file is memory mapped, we must keep the IO data synchronized
- * between the DMU cache and the memory mapped pages. What this means:
- *
- * On Read: We "read" preferentially from memory mapped pages,
- * else we default from the dmu buffer.
- *
- * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
+ * When a file is memory mapped, we must keep the I/O data synchronized
+ * between the DMU cache and the memory mapped pages. Preferentially read
+ * from memory mapped pages, otherwise fall back to reading through the dmu.
*/
int
mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
{
struct inode *ip = ZTOI(zp);
struct address_space *mp = ip->i_mapping;
- struct page *pp;
- int64_t start, off;
- uint64_t bytes;
+ int64_t start = uio->uio_loffset;
+ int64_t off = start & (PAGE_SIZE - 1);
int len = nbytes;
int error = 0;
- void *pb;
- start = uio->uio_loffset;
- off = start & (PAGE_SIZE-1);
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
- bytes = MIN(PAGE_SIZE - off, len);
+ uint64_t bytes = MIN(PAGE_SIZE - off, len);
- pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
if (pp) {
- ASSERT(PageUptodate(pp));
+ /*
+			 * If filemap_fault() retries, there exists a window
+			 * where the page will be unlocked and not up to date.
+			 * In this case we must try to fill the page.
+ */
+ if (unlikely(!PageUptodate(pp))) {
+ error = zfs_fillpage(ip, pp);
+ if (error) {
+ unlock_page(pp);
+ put_page(pp);
+ return (error);
+ }
+ }
+
+ ASSERT(PageUptodate(pp) || PageDirty(pp));
+
unlock_page(pp);
- pb = kmap(pp);
+ void *pb = kmap(pp);
error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
kunmap(pp);
@@ -312,9 +329,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
len -= bytes;
off = 0;
+
if (error)
break;
}
+
return (error);
}
#endif /* _KERNEL */
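/*
 * In short (a note, not part of the patch): mappedread() above now
 * tolerates a page that is locked but !PageUptodate - a window
 * filemap_fault() can leave behind when it retries - by filling the
 * page from the DMU via zfs_fillpage() before copying out of it,
 * instead of asserting that a cached page is always up to date.
 */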
@@ -449,8 +468,8 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
}
}
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zdp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
+ return (error);
*zpp = NULL;
@@ -460,12 +479,12 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
* Maybe someday we will.
*/
if (zdp->z_pflags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -474,17 +493,17 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
*/
if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
- B_TRUE, cr))) {
+ B_TRUE, cr, zfs_init_idmap))) {
zrele(*zpp);
*zpp = NULL;
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOTDIR));
}
@@ -492,14 +511,15 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
* Check accessibility of directory.
*/
- if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
- ZFS_EXIT(zfsvfs);
+ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
+ zfs_init_idmap))) {
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EILSEQ));
}
@@ -507,7 +527,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
if ((error == 0) && (*zpp))
zfs_znode_update_vfs(*zpp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -524,6 +544,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
* cr - credentials of caller.
* flag - file flag.
* vsecp - ACL to be set
+ * mnt_ns - user namespace of the mount
*
* OUT: zpp - znode of created or trunc'd entry.
*
@@ -535,7 +556,8 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
*/
int
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
- int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+ int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
+ zidmap_t *mnt_ns)
{
znode_t *zp;
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
@@ -550,6 +572,7 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
boolean_t fuid_dirtied;
boolean_t have_acl = B_FALSE;
boolean_t waited = B_FALSE;
+ boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
/*
* If we have an ephemeral id, ACL, or XVATTR then
@@ -566,21 +589,21 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
if (name == NULL)
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
+ return (error);
os = zfsvfs->z_os;
zilog = zfsvfs->z_log;
if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EILSEQ));
}
if (vap->va_mask & ATTR_XVATTR) {
if ((error = secpolicy_xvattr((xvattr_t *)vap,
crgetuid(cr), cr, vap->va_mode)) != 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
}
@@ -609,7 +632,7 @@ top:
zfs_acl_ids_free(&acl_ids);
if (strcmp(name, "..") == 0)
error = SET_ERROR(EISDIR);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
}
@@ -622,7 +645,8 @@ top:
* Create a new file object and update the directory
* to reference it.
*/
- if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
+ mnt_ns))) {
if (have_acl)
zfs_acl_ids_free(&acl_ids);
goto out;
@@ -641,7 +665,7 @@ top:
}
if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
+ cr, vsecp, &acl_ids, mnt_ns)) != 0)
goto out;
have_acl = B_TRUE;
@@ -681,7 +705,7 @@ top:
}
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
@@ -714,7 +738,6 @@ top:
if (have_acl)
zfs_acl_ids_free(&acl_ids);
- have_acl = B_FALSE;
/*
* A directory entry already exists for this name.
@@ -736,7 +759,8 @@ top:
/*
* Verify requested access to file.
*/
- if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
+ mnt_ns))) {
goto out;
}
@@ -774,13 +798,14 @@ out:
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
int
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
- int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
+ int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
+ zidmap_t *mnt_ns)
{
(void) excl, (void) mode, (void) flag;
znode_t *zp = NULL, *dzp = ITOZ(dip);
@@ -808,14 +833,14 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
(vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
+ return (error);
os = zfsvfs->z_os;
if (vap->va_mask & ATTR_XVATTR) {
if ((error = secpolicy_xvattr((xvattr_t *)vap,
crgetuid(cr), cr, vap->va_mode)) != 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
}
@@ -827,14 +852,14 @@ top:
* Create a new file object and update the directory
* to reference it.
*/
- if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
if (have_acl)
zfs_acl_ids_free(&acl_ids);
goto out;
}
if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
+ cr, vsecp, &acl_ids, mnt_ns)) != 0)
goto out;
have_acl = B_TRUE;
@@ -870,7 +895,7 @@ top:
}
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
@@ -894,7 +919,7 @@ out:
*ipp = ZTOI(zp);
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
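
The conversion running through these hunks, from the ZFS_ENTER()/ZFS_VERIFY_ZP() macros (which could return from the calling function implicitly) to explicit zfs_enter_verify_zp()/zfs_exit() calls with visible error propagation, follows one shape everywhere. Below is a minimal user-space sketch of that shape; a pthread rwlock stands in for the real teardown lock and a flag stands in for the SA-handle check, so the names mirror the kernel API but the bodies are illustrative, not the actual implementation.

#include <errno.h>
#include <pthread.h>

typedef struct zfsvfs {
	pthread_rwlock_t z_teardown_lock;	/* stands in for the teardown lock */
	int z_unmounted;
} zfsvfs_t;

typedef struct znode {
	int z_sa_hdl_valid;		/* stands in for zp->z_sa_hdl != NULL */
} znode_t;

static int
zfs_enter(zfsvfs_t *zfsvfs, const char *tag)
{
	(void) tag;			/* FTAG only matters for hold tracking */
	pthread_rwlock_rdlock(&zfsvfs->z_teardown_lock);
	if (zfsvfs->z_unmounted) {
		pthread_rwlock_unlock(&zfsvfs->z_teardown_lock);
		return (EIO);
	}
	return (0);
}

static void
zfs_exit(zfsvfs_t *zfsvfs, const char *tag)
{
	(void) tag;
	pthread_rwlock_unlock(&zfsvfs->z_teardown_lock);
}

static int
zfs_enter_verify_zp(zfsvfs_t *zfsvfs, znode_t *zp, const char *tag)
{
	int error;

	if ((error = zfs_enter(zfsvfs, tag)) != 0)
		return (error);
	if (!zp->z_sa_hdl_valid) {	/* models ZFS_VERIFY_ZP */
		zfs_exit(zfsvfs, tag);
		return (EIO);
	}
	return (0);
}

int
main(void)
{
	zfsvfs_t vfs = { PTHREAD_RWLOCK_INITIALIZER, 0 };
	znode_t zp = { 1 };
	int error;

	if ((error = zfs_enter_verify_zp(&vfs, &zp, "main")) != 0)
		return (error);
	zfs_exit(&vfs, "main");
	return (0);
}
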
@@ -941,8 +966,8 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
if (name == NULL)
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
+ return (error);
zilog = zfsvfs->z_log;
if (flags & FIGNORECASE) {
@@ -961,11 +986,11 @@ top:
NULL, realnmp))) {
if (realnmp)
pn_free(realnmp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
- if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
goto out;
}
@@ -979,7 +1004,7 @@ top:
mutex_enter(&zp->z_lock);
may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
- !(zp->z_is_mapped);
+ !zn_has_cached_data(zp, 0, LLONG_MAX);
mutex_exit(&zp->z_lock);
/*
@@ -1042,7 +1067,7 @@ top:
zrele(zp);
if (xzp)
zrele(xzp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1067,8 +1092,10 @@ top:
&xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
delete_now = may_delete_now && !toobig &&
atomic_read(&ZTOI(zp)->i_count) == 1 &&
- !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
+ !zn_has_cached_data(zp, 0, LLONG_MAX) &&
+ xattr_obj == xattr_obj_unlinked &&
zfs_external_acl(zp) == acl_obj;
+ VERIFY_IMPLY(xattr_obj_unlinked, xzp);
}
if (delete_now) {
@@ -1131,7 +1158,7 @@ out:
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1145,6 +1172,7 @@ out:
* cr - credentials of caller.
* flags - case flags.
* vsecp - ACL to be set
+ * mnt_ns - user namespace of the mount
*
* OUT: zpp - znode of created directory.
*
@@ -1157,7 +1185,7 @@ out:
*/
int
zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
- cred_t *cr, int flags, vsecattr_t *vsecp)
+ cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
{
znode_t *zp;
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
@@ -1188,18 +1216,18 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
if (dirname == NULL)
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
+ return (error);
zilog = zfsvfs->z_log;
if (dzp->z_pflags & ZFS_XATTR) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
if (zfsvfs->z_utf8 && u8_validate(dirname,
strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EILSEQ));
}
if (flags & FIGNORECASE)
@@ -1208,14 +1236,14 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
if (vap->va_mask & ATTR_XVATTR) {
if ((error = secpolicy_xvattr((xvattr_t *)vap,
crgetuid(cr), cr, vap->va_mode)) != 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
}
if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
- vsecp, &acl_ids)) != 0) {
- ZFS_EXIT(zfsvfs);
+ vsecp, &acl_ids, mnt_ns)) != 0) {
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
/*
@@ -1231,21 +1259,22 @@ top:
if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
NULL, NULL))) {
zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
- if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
+ mnt_ns))) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EDQUOT));
}
@@ -1277,7 +1306,7 @@ top:
}
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1323,7 +1352,7 @@ out:
zfs_znode_update_vfs(dzp);
zfs_znode_update_vfs(zp);
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1359,8 +1388,8 @@ zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
if (name == NULL)
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
+ return (error);
zilog = zfsvfs->z_log;
if (flags & FIGNORECASE)
@@ -1373,11 +1402,11 @@ top:
*/
if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
NULL, NULL))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
- if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
goto out;
}
@@ -1424,7 +1453,7 @@ top:
}
dmu_tx_abort(tx);
zrele(zp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1452,7 +1481,7 @@ out:
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1491,8 +1520,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
uint64_t parent;
uint64_t offset; /* must be unsigned; checks for < 1 */
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
&parent, sizeof (parent))) != 0)
@@ -1587,11 +1616,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
if (done)
break;
- /* Prefetch znode */
- if (prefetch) {
- dmu_prefetch(os, objnum, 0, 0, 0,
- ZIO_PRIORITY_SYNC_READ);
- }
+ if (prefetch)
+ dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
/*
* Move to the next entry, fill in the previous offset.
@@ -1611,7 +1637,7 @@ update:
if (error == ENOENT)
error = 0;
out:
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -1629,20 +1655,29 @@ out:
* RETURN: 0 (always succeeds)
*/
int
-zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip,
+#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
+zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
struct kstat *sp)
+#else
+zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
+#endif
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
uint32_t blksize;
u_longlong_t nblocks;
+ int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
mutex_enter(&zp->z_lock);
+#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
+ zpl_generic_fillattr(user_ns, request_mask, ip, sp);
+#else
zpl_generic_fillattr(user_ns, ip, sp);
+#endif
/*
* +1 link count for root inode with visible '.zfs' directory.
*/
@@ -1673,7 +1708,7 @@ zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip,
dmu_objset_id(zfsvfs->z_os);
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -1808,6 +1843,7 @@ next:
* flags - ATTR_UTIME set if non-default time values provided.
* - ATTR_NOACLCHECK (CIFS context only).
* cr - credentials of caller.
+ * mnt_ns - user namespace of the mount
*
* RETURN: 0 if success
* error code if failure
@@ -1816,11 +1852,11 @@ next:
* ip - ctime updated, mtime updated if size changed.
*/
int
-zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
{
struct inode *ip;
zfsvfs_t *zfsvfs = ZTOZSB(zp);
- objset_t *os = zfsvfs->z_os;
+ objset_t *os;
zilog_t *zilog;
dmu_tx_t *tx;
vattr_t oldva;
@@ -1849,9 +1885,10 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
if (mask == 0)
return (0);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (err);
ip = ZTOI(zp);
+ os = zfsvfs->z_os;
/*
* If this is a xvattr_t, then get a pointer to the structure of
@@ -1862,13 +1899,13 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
if (!dmu_objset_projectquota_enabled(os) ||
(!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOTSUP));
}
projid = xoap->xoa_projid;
if (unlikely(projid == ZFS_INVALID_PROJID)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -1883,7 +1920,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
(!dmu_objset_projectquota_enabled(os) ||
(!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOTSUP));
}
}
@@ -1899,17 +1936,17 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
(((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
(mask & ATTR_XVATTR))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EISDIR));
}
if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -1965,7 +2002,8 @@ top:
*/
if (mask & ATTR_SIZE) {
- err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
+ mnt_ns);
if (err)
goto out3;
@@ -1990,13 +2028,15 @@ top:
XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
- skipaclchk, cr);
+ skipaclchk, cr, mnt_ns);
}
if (mask & (ATTR_UID|ATTR_GID)) {
int idmask = (mask & (ATTR_UID|ATTR_GID));
int take_owner;
int take_group;
+ uid_t uid;
+ gid_t gid;
/*
* NOTE: even if a new mode is being set,
@@ -2010,9 +2050,13 @@ top:
* Take ownership or chgrp to group we are a member of
*/
- take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
+ uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
+ vap->va_uid);
+ gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
+ vap->va_gid);
+ take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
take_group = (mask & ATTR_GID) &&
- zfs_groupmember(zfsvfs, vap->va_gid, cr);
+ zfs_groupmember(zfsvfs, gid, cr);
/*
* If both ATTR_UID and ATTR_GID are set then take_owner and
@@ -2028,7 +2072,7 @@ top:
((idmask == ATTR_UID) && take_owner) ||
((idmask == ATTR_GID) && take_group)) {
if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
- skipaclchk, cr) == 0) {
+ skipaclchk, cr, mnt_ns) == 0) {
/*
* Remove setuid/setgid for non-privileged users
*/
@@ -2141,12 +2185,12 @@ top:
mutex_exit(&zp->z_lock);
if (mask & ATTR_MODE) {
- if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
+ mnt_ns) == 0) {
err = secpolicy_setid_setsticky_clear(ip, vap,
- &oldva, cr);
+ &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
if (err)
goto out3;
-
trim_mask |= ATTR_MODE;
} else {
need_policy = TRUE;
@@ -2167,7 +2211,7 @@ top:
vap->va_mask &= ~trim_mask;
}
err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
- (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ zfs_zaccess_unix, zp);
if (err)
goto out3;
@@ -2395,15 +2439,16 @@ top:
if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
zp->z_atime_dirty = B_FALSE;
- ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
+ ZFS_TIME_ENCODE(&tmp_atime, atime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
&atime, sizeof (atime));
}
if (mask & (ATTR_MTIME | ATTR_SIZE)) {
ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
- ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
- vap->va_mtime, ZTOI(zp));
+ zpl_inode_set_mtime_to_ts(ZTOI(zp),
+ zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
mtime, sizeof (mtime));
@@ -2411,8 +2456,8 @@ top:
if (mask & (ATTR_CTIME | ATTR_SIZE)) {
ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
- ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
- ZTOI(zp));
+ zpl_inode_set_ctime_to_ts(ZTOI(zp),
+ zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
ctime, sizeof (ctime));
}
@@ -2512,7 +2557,7 @@ out:
dmu_tx_commit(tx);
if (attrzp) {
if (err2 == 0 && handle_eadir)
- err2 = zfs_setattr_dir(attrzp);
+ err = zfs_setattr_dir(attrzp);
zrele(attrzp);
}
zfs_znode_update_vfs(zp);
@@ -2526,7 +2571,7 @@ out3:
kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
kmem_free(tmpxvattr, sizeof (xvattr_t));
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (err);
}
@@ -2637,6 +2682,9 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
* tnm - New entry name.
* cr - credentials of caller.
* flags - case flags
+ * rflags - RENAME_* flags
+ *	wo_vap	- attributes for RENAME_WHITEOUT (must be a char device 0:0).
+ * mnt_ns - user namespace of the mount
*
* RETURN: 0 on success, error code on failure.
*
@@ -2645,7 +2693,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
*/
int
zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
- cred_t *cr, int flags)
+ cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
{
znode_t *szp, *tzp;
zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
@@ -2657,15 +2705,41 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
int error = 0;
int zflg = 0;
boolean_t waited = B_FALSE;
+ /* Needed for whiteout inode creation. */
+ boolean_t fuid_dirtied;
+ zfs_acl_ids_t acl_ids;
+ boolean_t have_acl = B_FALSE;
+ znode_t *wzp = NULL;
+
if (snm == NULL || tnm == NULL)
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(sdzp);
+ if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+ return (SET_ERROR(EINVAL));
+
+ /* Already checked by Linux VFS, but just to make sure. */
+ if (rflags & RENAME_EXCHANGE &&
+ (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
+ return (SET_ERROR(EINVAL));
+
+ /*
+	 * Make sure we get wo_vap if and only if RENAME_WHITEOUT is set, and
+	 * that it is the right kind of vattr_t for the whiteout file. These
+	 * are set internally by ZFS, so they should never be incorrect.
+ */
+ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
+ VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
+ VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
+
+ if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
+ return (error);
zilog = zfsvfs->z_log;
- ZFS_VERIFY_ZP(tdzp);
+ if ((error = zfs_verify_zp(tdzp)) != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
/*
* We check i_sb because snapshots and the ctldir must have different
@@ -2673,13 +2747,13 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
*/
if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
zfsctl_is_node(ZTOI(tdzp))) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EXDEV));
}
if (zfsvfs->z_utf8 && u8_validate(tnm,
strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EILSEQ));
}
@@ -2697,7 +2771,7 @@ top:
* See the comment in zfs_link() for why this is considered bad.
*/
if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -2727,7 +2801,7 @@ top:
* the rename() function shall return successfully
* and perform no other action."
*/
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
/*
@@ -2799,7 +2873,7 @@ top:
if (strcmp(snm, "..") == 0)
serr = EINVAL;
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (serr);
}
if (terr) {
@@ -2811,7 +2885,7 @@ top:
if (strcmp(tnm, "..") == 0)
terr = EINVAL;
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (terr);
}
@@ -2834,8 +2908,7 @@ top:
* Note that if target and source are the same, this can be
* done in a single check.
*/
-
- if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
goto out;
if (S_ISDIR(ZTOI(szp)->i_mode)) {
@@ -2851,17 +2924,19 @@ top:
* Does target exist?
*/
if (tzp) {
+ if (rflags & RENAME_NOREPLACE) {
+ error = SET_ERROR(EEXIST);
+ goto out;
+ }
/*
- * Source and target must be the same type.
+ * Source and target must be the same type (unless exchanging).
*/
- if (S_ISDIR(ZTOI(szp)->i_mode)) {
- if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
- error = SET_ERROR(ENOTDIR);
- goto out;
- }
- } else {
- if (S_ISDIR(ZTOI(tzp)->i_mode)) {
- error = SET_ERROR(EISDIR);
+ if (!(rflags & RENAME_EXCHANGE)) {
+ boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
+ boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
+
+ if (s_is_dir != t_is_dir) {
+ error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
goto out;
}
}
@@ -2874,12 +2949,43 @@ top:
error = 0;
goto out;
}
+ } else if (rflags & RENAME_EXCHANGE) {
+ /* Target must exist for RENAME_EXCHANGE. */
+ error = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ /* Set up inode creation for RENAME_WHITEOUT. */
+ if (rflags & RENAME_WHITEOUT) {
+ /*
+ * Whiteout files are not regular files or directories, so to
+ * match zfs_create() we do not inherit the project id.
+ */
+ uint64_t wo_projid = ZFS_DEFAULT_PROJID;
+
+ error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
+ if (error)
+ goto out;
+
+ if (!have_acl) {
+ error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
+ &acl_ids, mnt_ns);
+ if (error)
+ goto out;
+ have_acl = B_TRUE;
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
}
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
- dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, sdzp->z_id,
+ (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
if (sdzp != tdzp) {
dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
@@ -2889,7 +2995,21 @@ top:
dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, tzp);
}
+ if (rflags & RENAME_WHITEOUT) {
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ }
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
@@ -2915,62 +3035,114 @@ top:
zrele(szp);
if (tzp)
zrele(tzp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
- if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+ /*
+ * Unlink the source.
+ */
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+ if (tdzp->z_pflags & ZFS_PROJINHERIT)
+ szp->z_pflags |= ZFS_PROJINHERIT;
- if (error == 0) {
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
- if (error == 0) {
- szp->z_pflags |= ZFS_AV_MODIFIED;
- if (tdzp->z_pflags & ZFS_PROJINHERIT)
- szp->z_pflags |= ZFS_PROJINHERIT;
-
- error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
- (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ VERIFY0(error);
+
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ if (error)
+ goto commit;
+
+ /*
+ * Unlink the target.
+ */
+ if (tzp) {
+ int tzflg = zflg;
+
+ if (rflags & RENAME_EXCHANGE) {
+ /* This inode will be re-linked soon. */
+ tzflg |= ZRENAMING;
+
+ tzp->z_pflags |= ZFS_AV_MODIFIED;
+ if (sdzp->z_pflags & ZFS_PROJINHERIT)
+ tzp->z_pflags |= ZFS_PROJINHERIT;
+
+ error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
ASSERT0(error);
+ }
+ error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
+ if (error)
+ goto commit_link_szp;
+ }
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
- if (error == 0) {
- zfs_log_rename(zilog, tx, TX_RENAME |
- (flags & FIGNORECASE ? TX_CI : 0), sdzp,
- sdl->dl_name, tdzp, tdl->dl_name, szp);
- } else {
- /*
- * At this point, we have successfully created
- * the target name, but have failed to remove
- * the source name. Since the create was done
- * with the ZRENAMING flag, there are
- * complications; for one, the link count is
- * wrong. The easiest way to deal with this
- * is to remove the newly created target, and
- * return the original error. This must
- * succeed; fortunately, it is very unlikely to
- * fail, since we just created it.
- */
- VERIFY3U(zfs_link_destroy(tdl, szp, tx,
- ZRENAMING, NULL), ==, 0);
- }
- } else {
- /*
- * If we had removed the existing target, subsequent
- * call to zfs_link_create() to add back the same entry
- * but, the new dnode (szp) should not fail.
- */
- ASSERT(tzp == NULL);
+ /*
+ * Create the new target links:
+ * * We always link the target.
+ * * RENAME_EXCHANGE: Link the old target to the source.
+	 *   * RENAME_WHITEOUT: Create a whiteout inode in place of the source.
+ */
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error) {
+ /*
+ * If we have removed the existing target, a subsequent call to
+ * zfs_link_create() to add back the same entry, but with a new
+ * dnode (szp), should not fail.
+ */
+ ASSERT3P(tzp, ==, NULL);
+ goto commit_link_tzp;
+ }
+
+ switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
+ case RENAME_EXCHANGE:
+ error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
+ /*
+ * The same argument as zfs_link_create() failing for
+ * szp applies here, since the source directory must
+ * have had an entry we are replacing.
+ */
+ ASSERT0(error);
+ if (error)
+ goto commit_unlink_td_szp;
+ break;
+ case RENAME_WHITEOUT:
+ zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
+ error = zfs_link_create(sdl, wzp, tx, ZNEW);
+ if (error) {
+ zfs_znode_delete(wzp, tx);
+ remove_inode_hash(ZTOI(wzp));
+ goto commit_unlink_td_szp;
}
+ break;
}
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
+ case RENAME_EXCHANGE:
+ zfs_log_rename_exchange(zilog, tx,
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
+ tdzp, tdl->dl_name, szp);
+ break;
+ case RENAME_WHITEOUT:
+ zfs_log_rename_whiteout(zilog, tx,
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
+ tdzp, tdl->dl_name, szp, wzp);
+ break;
+ default:
+ ASSERT0(rflags & ~RENAME_NOREPLACE);
+ zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
+ sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+ break;
+ }
+
+commit:
dmu_tx_commit(tx);
out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
-
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
zfs_znode_update_vfs(sdzp);
if (sdzp == tdzp)
@@ -2981,16 +3153,57 @@ out:
zfs_znode_update_vfs(szp);
zrele(szp);
+ if (wzp) {
+ zfs_znode_update_vfs(wzp);
+ zrele(wzp);
+ }
if (tzp) {
zfs_znode_update_vfs(tzp);
zrele(tzp);
}
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
+
+ /*
+ * Clean-up path for broken link state.
+ *
+ * At this point we are in a (very) bad state, so we need to do our
+ * best to correct the state. In particular, all of the nlinks are
+ * wrong because we were destroying and creating links with ZRENAMING.
+ *
+ * In some form, all of these operations have to resolve the state:
+ *
+ * * link_destroy() *must* succeed. Fortunately, this is very likely
+ * since we only just created it.
+ *
+ * * link_create()s are allowed to fail (though they shouldn't because
+ * we only just unlinked them and are putting the entries back
+ * during clean-up). But if they fail, we can just forcefully drop
+ * the nlink value to (at the very least) avoid broken nlink values
+ * -- though in the case of non-empty directories we will have to
+ * panic (otherwise we'd have a leaked directory with a broken ..).
+ */
+commit_unlink_td_szp:
+ VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
+commit_link_tzp:
+ if (tzp) {
+ if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
+ VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
+ }
+commit_link_szp:
+ if (zfs_link_create(sdl, szp, tx, ZRENAMING))
+ VERIFY0(zfs_drop_nlink(szp, tx, NULL));
+ goto commit;
}
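
zfs_rename() now implements the renameat2() semantics itself, and its flag handling reduces to a small decision table. The sketch below is a runnable user-space model of exactly those checks; the flag values are the standard Linux renameat2() ones, and target_exists stands in for tzp != NULL.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define	RENAME_NOREPLACE	(1 << 0)	/* standard renameat2() values */
#define	RENAME_EXCHANGE		(1 << 1)
#define	RENAME_WHITEOUT		(1 << 2)

static int
check_rename_flags(unsigned int rflags, bool target_exists)
{
	/* Reject flags zfs_rename() does not understand. */
	if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return (EINVAL);
	/* EXCHANGE is mutually exclusive with NOREPLACE and WHITEOUT. */
	if ((rflags & RENAME_EXCHANGE) &&
	    (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
		return (EINVAL);
	/* NOREPLACE refuses to clobber an existing target. */
	if (target_exists && (rflags & RENAME_NOREPLACE))
		return (EEXIST);
	/* EXCHANGE needs both endpoints to exist. */
	if (!target_exists && (rflags & RENAME_EXCHANGE))
		return (ENOENT);
	return (0);
}

int
main(void)
{
	printf("%d %d %d\n",
	    check_rename_flags(RENAME_EXCHANGE, false),	/* ENOENT */
	    check_rename_flags(RENAME_NOREPLACE, true),	/* EEXIST */
	    check_rename_flags(RENAME_WHITEOUT, true));	/* 0 */
	return (0);
}
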
/*
@@ -3002,6 +3215,7 @@ out:
* link - Name for new symlink entry.
* cr - credentials of caller.
* flags - case flags
+ * mnt_ns - user namespace of the mount
*
* OUT: zpp - Znode for new symbolic link.
*
@@ -3012,7 +3226,7 @@ out:
*/
int
zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
- znode_t **zpp, cred_t *cr, int flags)
+ znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
{
znode_t *zp;
zfs_dirlock_t *dl;
@@ -3032,26 +3246,26 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
if (name == NULL)
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(dzp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
+ return (error);
zilog = zfsvfs->z_log;
if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EILSEQ));
}
if (flags & FIGNORECASE)
zflg |= ZCILOOK;
if (len > MAXPATHLEN) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENAMETOOLONG));
}
if ((error = zfs_acl_ids_create(dzp, 0,
- vap, cr, NULL, &acl_ids)) != 0) {
- ZFS_EXIT(zfsvfs);
+ vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
top:
@@ -3063,21 +3277,21 @@ top:
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
if (error) {
zfs_acl_ids_free(&acl_ids);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
- if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EDQUOT));
}
tx = dmu_tx_create(zfsvfs->z_os);
@@ -3104,7 +3318,7 @@ top:
}
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3159,7 +3373,7 @@ top:
zrele(zp);
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3185,8 +3399,8 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
zfsvfs_t *zfsvfs = ITOZSB(ip);
int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
@@ -3196,7 +3410,7 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
error = zfs_sa_readlink(zp, uio);
mutex_exit(&zp->z_lock);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3241,8 +3455,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
if (name == NULL)
return (SET_ERROR(EINVAL));
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(tdzp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
+ return (error);
zilog = zfsvfs->z_log;
/*
@@ -3250,11 +3464,14 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
* Better choices include ENOTSUP or EISDIR.
*/
if (S_ISDIR(sip->i_mode)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
}
- ZFS_VERIFY_ZP(szp);
+ if ((error = zfs_verify_zp(szp)) != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
/*
* If we are using project inheritance, means if the directory has
@@ -3265,7 +3482,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
*/
if (tdzp->z_pflags & ZFS_PROJINHERIT &&
tdzp->z_projid != szp->z_projid) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EXDEV));
}
@@ -3274,7 +3491,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
* super blocks.
*/
if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EXDEV));
}
@@ -3282,17 +3499,17 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
&parent, sizeof (uint64_t))) != 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
if (parent == zfsvfs->z_shares_dir) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
}
if (zfsvfs->z_utf8 && u8_validate(name,
strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EILSEQ));
}
if (flags & FIGNORECASE)
@@ -3305,19 +3522,20 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
* imposed in attribute space.
*/
if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
cr, ZFS_OWNER);
if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
}
- if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
- ZFS_EXIT(zfsvfs);
+ if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
+ zfs_init_idmap))) {
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3327,7 +3545,7 @@ top:
*/
error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
if (error) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3349,7 +3567,7 @@ top:
goto top;
}
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
/* unmark z_unlinked so zfs_link_create will not reject */
@@ -3391,7 +3609,7 @@ top:
zfs_znode_update_vfs(tdzp);
zfs_znode_update_vfs(szp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3444,12 +3662,13 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
caddr_t va;
int err = 0;
uint64_t mtime[2], ctime[2];
+ inode_timespec_t tmp_ts;
sa_bulk_attr_t bulk[3];
int cnt = 0;
struct address_space *mapping;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (err);
ASSERT(PageLocked(pp));
@@ -3461,7 +3680,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
/* Page is beyond end of file */
if (pgoff >= offset) {
unlock_page(pp);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -3521,7 +3740,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
unlock_page(pp);
zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -3549,7 +3768,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
#endif
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -3557,7 +3776,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
if (!clear_page_dirty_for_io(pp)) {
unlock_page(pp);
zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -3576,11 +3795,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
- err = dmu_tx_assign(tx, TXG_NOWAIT);
+ err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
- if (err == ERESTART)
- dmu_tx_wait(tx);
-
dmu_tx_abort(tx);
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
@@ -3592,7 +3808,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
if (!for_sync)
atomic_dec_32(&zp->z_async_writes_cnt);
zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (err);
}
@@ -3607,28 +3823,23 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
&zp->z_pflags, 8);
/* Preserve the mtime and ctime provided by the inode */
- ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
- ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ tmp_ts = zpl_inode_get_mtime(ip);
+ ZFS_TIME_ENCODE(&tmp_ts, mtime);
+ tmp_ts = zpl_inode_get_ctime(ip);
+ ZFS_TIME_ENCODE(&tmp_ts, ctime);
zp->z_atime_dirty = B_FALSE;
zp->z_seq++;
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
- for_sync ? zfs_putpage_sync_commit_cb :
- zfs_putpage_async_commit_cb, pp);
-
- dmu_tx_commit(tx);
-
- zfs_rangelock_exit(lr);
-
+ boolean_t commit = B_FALSE;
if (wbc->sync_mode != WB_SYNC_NONE) {
/*
* Note that this is rarely called under writepages(), because
* writepages() normally handles the entire commit for
* performance reasons.
*/
- zil_commit(zfsvfs->z_log, zp->z_id);
+ commit = B_TRUE;
} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
/*
* If the caller does not intend to wait synchronously
@@ -3638,12 +3849,23 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
* our writeback to complete. Refer to the comment in
* zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
*/
- zil_commit(zfsvfs->z_log, zp->z_id);
+ commit = B_TRUE;
}
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
+ for_sync ? zfs_putpage_sync_commit_cb :
+ zfs_putpage_async_commit_cb, pp);
+
+ dmu_tx_commit(tx);
+
+ zfs_rangelock_exit(lr);
+
+ if (commit)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (err);
}
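
The zfs_putpage() hunks make two related changes: dmu_tx_assign() now uses TXG_WAIT instead of the open-coded TXG_NOWAIT/ERESTART retry, and the decision to commit the ZIL is computed up front so it can be passed to zfs_log_write() as the commit hint (previously hardcoded to 0), with the zil_commit() call itself happening after the tx commits and the range lock drops. A stub-based sketch of the new ordering follows; all callees are stand-ins, not the kernel functions.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins, not the kernel functions. */
static void zfs_log_write_stub(bool commit) { printf("log_write(commit=%d)\n", commit); }
static void dmu_tx_commit_stub(void) { printf("dmu_tx_commit\n"); }
static void rangelock_exit_stub(void) { printf("rangelock_exit\n"); }
static void zil_commit_stub(void) { printf("zil_commit\n"); }

static void
putpage_tail(bool wb_sync, bool for_sync, unsigned int sync_writers)
{
	/* Decide first, so the log record can carry the commit hint. */
	bool commit = wb_sync || (!for_sync && sync_writers > 0);

	zfs_log_write_stub(commit);
	dmu_tx_commit_stub();
	rangelock_exit_stub();

	/* Force the ZIL only after the tx and range lock are released. */
	if (commit)
		zil_commit_stub();
}

int
main(void)
{
	putpage_tail(true, false, 0);
	return (0);
}
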
@@ -3658,6 +3880,7 @@ zfs_dirty_inode(struct inode *ip, int flags)
zfsvfs_t *zfsvfs = ITOZSB(ip);
dmu_tx_t *tx;
uint64_t mode, atime[2], mtime[2], ctime[2];
+ inode_timespec_t tmp_ts;
sa_bulk_attr_t bulk[4];
int error = 0;
int cnt = 0;
@@ -3665,8 +3888,8 @@ zfs_dirty_inode(struct inode *ip, int flags)
if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
return (0);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
#ifdef I_DIRTY_TIME
/*
@@ -3702,9 +3925,12 @@ zfs_dirty_inode(struct inode *ip, int flags)
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
/* Preserve the mode, mtime and ctime provided by the inode */
- ZFS_TIME_ENCODE(&ip->i_atime, atime);
- ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
- ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ tmp_ts = zpl_inode_get_atime(ip);
+ ZFS_TIME_ENCODE(&tmp_ts, atime);
+ tmp_ts = zpl_inode_get_mtime(ip);
+ ZFS_TIME_ENCODE(&tmp_ts, mtime);
+ tmp_ts = zpl_inode_get_ctime(ip);
+ ZFS_TIME_ENCODE(&tmp_ts, ctime);
mode = ip->i_mode;
zp->z_mode = mode;
@@ -3714,7 +3940,7 @@ zfs_dirty_inode(struct inode *ip, int flags)
dmu_tx_commit(tx);
out:
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3747,7 +3973,9 @@ zfs_inactive(struct inode *ip)
if (error) {
dmu_tx_abort(tx);
} else {
- ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ inode_timespec_t tmp_atime;
+ tmp_atime = zpl_inode_get_atime(ip);
+ ZFS_TIME_ENCODE(&tmp_atime, atime);
mutex_enter(&zp->z_lock);
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
(void *)&atime, sizeof (atime), tx);
@@ -3766,55 +3994,45 @@ zfs_inactive(struct inode *ip)
* Fill pages with data from the disk.
*/
static int
-zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
+zfs_fillpage(struct inode *ip, struct page *pp)
{
- znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
- objset_t *os;
- struct page *cur_pp;
- u_offset_t io_off, total;
- size_t io_len;
- loff_t i_size;
- unsigned page_idx;
- int err;
+ loff_t i_size = i_size_read(ip);
+ u_offset_t io_off = page_offset(pp);
+ size_t io_len = PAGE_SIZE;
- os = zfsvfs->z_os;
- io_len = nr_pages << PAGE_SHIFT;
- i_size = i_size_read(ip);
- io_off = page_offset(pl[0]);
+ ASSERT3U(io_off, <, i_size);
if (io_off + io_len > i_size)
io_len = i_size - io_off;
- /*
- * Iterate over list of pages and read each page individually.
- */
- page_idx = 0;
- for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
- caddr_t va;
+ void *va = kmap(pp);
+ int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
+ io_len, va, DMU_READ_PREFETCH);
+ if (io_len != PAGE_SIZE)
+ memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
+ kunmap(pp);
- cur_pp = pl[page_idx++];
- va = kmap(cur_pp);
- err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
- DMU_READ_PREFETCH);
- kunmap(cur_pp);
- if (err) {
- /* convert checksum errors into IO errors */
- if (err == ECKSUM)
- err = SET_ERROR(EIO);
- return (err);
- }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+
+ SetPageError(pp);
+ ClearPageUptodate(pp);
+ } else {
+ ClearPageError(pp);
+ SetPageUptodate(pp);
}
- return (0);
+ return (error);
}
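
The rewritten zfs_fillpage() reads exactly one page and zeroes the tail when that page straddles EOF (the new ASSERT3U guarantees the page starts below EOF). A self-contained model of that tail-zeroing arithmetic, with memcpy standing in for dmu_read():

#include <stdio.h>
#include <string.h>

#define	PAGE_SIZE	4096

static void
fill_page(char page[PAGE_SIZE], const char *file, size_t i_size, size_t off)
{
	size_t io_len = PAGE_SIZE;	/* caller guarantees off < i_size */

	if (off + io_len > i_size)		/* page straddles EOF */
		io_len = i_size - off;
	memcpy(page, file + off, io_len);	/* models dmu_read() */
	if (io_len != PAGE_SIZE)		/* zero from EOF to page end */
		memset(page + io_len, 0, PAGE_SIZE - io_len);
}

int
main(void)
{
	static char file[6000], page[PAGE_SIZE];

	memset(file, 'x', sizeof (file));
	fill_page(page, file, sizeof (file), 4096);	/* last, short page */
	printf("byte 1903: %c, byte 1904: %d\n", page[1903], page[1904]);
	return (0);
}
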
/*
- * Uses zfs_fillpage to read data from the file and fill the pages.
+ * Uses zfs_fillpage to read data from the file and fill the page.
*
* IN: ip - inode of file to get data from.
- * pl - list of pages to read
- * nr_pages - number of pages to read
+ * pp - page to read
*
* RETURN: 0 on success, error code on failure.
*
@@ -3822,24 +4040,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
* vp - atime updated
*/
int
-zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+zfs_getpage(struct inode *ip, struct page *pp)
{
- znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
- int err;
-
- if (pl == NULL)
- return (0);
+ znode_t *zp = ITOZ(ip);
+ int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
- err = zfs_fillpage(ip, pl, nr_pages);
+ error = zfs_fillpage(ip, pp);
+ if (error == 0)
+ dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
- dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
+ zfs_exit(zfsvfs, FTAG);
- ZFS_EXIT(zfsvfs);
- return (err);
+ return (error);
}
/*
@@ -3861,28 +4077,29 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
(void) addrp;
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
- if ((vm_flags & VM_WRITE) && (zp->z_pflags &
- (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
- ZFS_EXIT(zfsvfs);
+ if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
+ (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
}
if ((vm_flags & (VM_READ | VM_EXEC)) &&
(zp->z_pflags & ZFS_AV_QUARANTINED)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EACCES));
}
if (off < 0 || len > MAXOFFSET_T - off) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENXIO));
}
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
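
The zfs_map() change narrows the immutable/readonly/appendonly rejection to writable mappings that are also VM_SHARED, since a private (copy-on-write) writable mapping can never modify the file. A compilable model of the tightened check; the flag values mirror the kernel's, but only the logic matters here.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define	VM_WRITE	0x2UL	/* mirror the kernel's values; illustrative */
#define	VM_SHARED	0x8UL

static int
map_check(unsigned long vm_flags, bool immutable)
{
	/*
	 * Only a MAP_SHARED writable mapping can reach the file, so the
	 * rejection now requires VM_SHARED too; private writable (COW)
	 * mappings of an immutable file are allowed again.
	 */
	if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && immutable)
		return (EPERM);
	return (0);
}

int
main(void)
{
	printf("shared rw: %d, private rw: %d\n",
	    map_check(VM_WRITE | VM_SHARED, true),	/* EPERM */
	    map_check(VM_WRITE, true));			/* 0 */
	return (0);
}
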
@@ -3913,11 +4130,11 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
uint64_t off, len;
int error;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
if (cmd != F_FREESP) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -3926,12 +4143,12 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
* so check it explicitly here.
*/
if (zfs_is_readonly(zfsvfs)) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EROFS));
}
if (bfp->l_len < 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -3941,8 +4158,9 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
* On Linux we can get here through truncate_range() which
* operates directly on inodes, so we need to check access rights.
*/
- if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
- ZFS_EXIT(zfsvfs);
+ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
+ zfs_init_idmap))) {
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3951,7 +4169,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
error = zfs_freesp(zp, off, len, flag, TRUE);
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3966,19 +4184,23 @@ zfs_fid(struct inode *ip, fid_t *fidp)
zfid_short_t *zfid;
int size, i, error;
- ZFS_ENTER(zfsvfs);
+ if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+ return (error);
if (fidp->fid_len < SHORT_FID_LEN) {
fidp->fid_len = SHORT_FID_LEN;
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(ENOSPC));
}
- ZFS_VERIFY_ZP(zp);
+ if ((error = zfs_verify_zp(zp)) != 0) {
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
&gen64, sizeof (uint64_t))) != 0) {
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (error);
}
@@ -3999,7 +4221,7 @@ zfs_fid(struct inode *ip, fid_t *fidp)
for (i = 0; i < sizeof (zfid->zf_gen); i++)
zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
- ZFS_EXIT(zfsvfs);
+ zfs_exit(zfsvfs, FTAG);
return (0);
}
@@ -4030,5 +4252,4 @@ EXPORT_SYMBOL(zfs_map);
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
-
#endif
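
Every timestamp hunk in this diff swaps direct ip->i_atime/i_mtime/i_ctime access for zpl_inode_get_*()/zpl_inode_set_*_to_ts() accessors; the on-disk encoding is unchanged. A small runnable model of that encoding, mirroring what the ZFS_TIME_ENCODE/ZFS_TIME_DECODE macros do with the two-uint64 SA layout (the helper names here are local stand-ins for the macros):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static void
zfs_time_encode(const struct timespec *ts, uint64_t sa[2])
{
	sa[0] = (uint64_t)ts->tv_sec;	/* SA stores two uint64s per stamp */
	sa[1] = (uint64_t)ts->tv_nsec;
}

static void
zfs_time_decode(struct timespec *ts, const uint64_t sa[2])
{
	ts->tv_sec = (time_t)sa[0];
	ts->tv_nsec = (long)sa[1];
}

int
main(void)
{
	struct timespec now, back;
	uint64_t sa[2];

	clock_gettime(CLOCK_REALTIME, &now);
	zfs_time_encode(&now, sa);	/* inode timespec -> SA attribute */
	zfs_time_decode(&back, sa);	/* SA attribute -> inode timespec */
	printf("%lld.%09ld\n", (long long)back.tv_sec, back.tv_nsec);
	return (0);
}
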
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
index dc504b1a120b..b99df188c64b 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -169,8 +169,7 @@ zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
znode_hold_t *zh = buf;
mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
- zfs_refcount_create(&zh->zh_refcount);
- zh->zh_obj = ZFS_NO_OBJECT;
+ zh->zh_refcount = 0;
return (0);
}
@@ -182,7 +181,6 @@ zfs_znode_hold_cache_destructor(void *buf, void *arg)
znode_hold_t *zh = buf;
mutex_destroy(&zh->zh_lock);
- zfs_refcount_destroy(&zh->zh_refcount);
}
void
@@ -273,7 +271,7 @@ zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
return (held);
}
-static znode_hold_t *
+znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
znode_hold_t *zh, *zh_new, search;
@@ -281,43 +279,43 @@ zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
boolean_t found = B_FALSE;
zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
- zh_new->zh_obj = obj;
search.zh_obj = obj;
mutex_enter(&zfsvfs->z_hold_locks[i]);
zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
if (likely(zh == NULL)) {
zh = zh_new;
+ zh->zh_obj = obj;
avl_add(&zfsvfs->z_hold_trees[i], zh);
} else {
ASSERT3U(zh->zh_obj, ==, obj);
found = B_TRUE;
}
- zfs_refcount_add(&zh->zh_refcount, NULL);
+ zh->zh_refcount++;
+ ASSERT3S(zh->zh_refcount, >, 0);
mutex_exit(&zfsvfs->z_hold_locks[i]);
if (found == B_TRUE)
kmem_cache_free(znode_hold_cache, zh_new);
ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
- ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
mutex_enter(&zh->zh_lock);
return (zh);
}
-static void
+void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
boolean_t remove = B_FALSE;
ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
- ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
mutex_exit(&zh->zh_lock);
mutex_enter(&zfsvfs->z_hold_locks[i]);
- if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
+ ASSERT3S(zh->zh_refcount, >, 0);
+ if (--zh->zh_refcount == 0) {
avl_remove(&zfsvfs->z_hold_trees[i], zh);
remove = B_TRUE;
}
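
The znode-hold hunks replace the atomic zfs_refcount_t in znode_hold_t with a plain int, which is safe because zh_refcount is only ever manipulated while the per-bucket z_hold_locks[i] mutex is held. A user-space model of the enter/exit protocol under that assumption, with pthread mutexes standing in for the kernel locks and the AVL bookkeeping omitted:

#include <assert.h>
#include <pthread.h>

typedef struct hold {
	pthread_mutex_t h_lock;	/* zh_lock: serializes the object */
	int h_refcount;		/* zh_refcount: touched only under bucket_lock */
} hold_t;

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

static void
hold_enter(hold_t *h)
{
	pthread_mutex_lock(&bucket_lock);
	h->h_refcount++;	/* no atomics needed under bucket_lock */
	pthread_mutex_unlock(&bucket_lock);
	pthread_mutex_lock(&h->h_lock);
}

static int			/* nonzero: caller should free h */
hold_exit(hold_t *h)
{
	int remove;

	pthread_mutex_unlock(&h->h_lock);
	pthread_mutex_lock(&bucket_lock);
	assert(h->h_refcount > 0);
	remove = (--h->h_refcount == 0);
	pthread_mutex_unlock(&bucket_lock);
	return (remove);
}

int
main(void)
{
	hold_t h = { PTHREAD_MUTEX_INITIALIZER, 0 };

	hold_enter(&h);
	return (hold_exit(&h) ? 0 : 1);	/* drops to zero, so 0 */
}
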
@@ -359,7 +357,7 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
void
zfs_znode_dmu_fini(znode_t *zp)
{
- ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
+ ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
sa_handle_destroy(zp->z_sa_hdl);
@@ -392,7 +390,6 @@ zfs_inode_destroy(struct inode *ip)
mutex_enter(&zfsvfs->z_znodes_lock);
if (list_link_active(&zp->z_link_node)) {
list_remove(&zfsvfs->z_all_znodes, zp);
- zfsvfs->z_nr_znodes--;
}
mutex_exit(&zfsvfs->z_znodes_lock);
@@ -417,12 +414,21 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
switch (ip->i_mode & S_IFMT) {
case S_IFREG:
ip->i_op = &zpl_inode_operations;
+#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
+ ip->i_fop = &zpl_file_operations.kabi_fops;
+#else
ip->i_fop = &zpl_file_operations;
+#endif
ip->i_mapping->a_ops = &zpl_address_space_operations;
break;
case S_IFDIR:
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+ ip->i_flags |= S_IOPS_WRAPPER;
+ ip->i_op = &zpl_dir_inode_operations.ops;
+#else
ip->i_op = &zpl_dir_inode_operations;
+#endif
ip->i_fop = &zpl_dir_file_operations;
ITOZ(ip)->z_zn_prefetch = B_TRUE;
break;
@@ -452,7 +458,11 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
/* Assume the inode is a file and attempt to continue */
ip->i_mode = S_IFREG | 0644;
ip->i_op = &zpl_inode_operations;
+#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
+ ip->i_fop = &zpl_file_operations.kabi_fops;
+#else
ip->i_fop = &zpl_file_operations;
+#endif
ip->i_mapping->a_ops = &zpl_address_space_operations;
break;
}
@@ -492,13 +502,11 @@ zfs_set_inode_flags(znode_t *zp, struct inode *ip)
void
zfs_znode_update_vfs(znode_t *zp)
{
- zfsvfs_t *zfsvfs;
struct inode *ip;
uint32_t blksize;
u_longlong_t i_blocks;
ASSERT(zp != NULL);
- zfsvfs = ZTOZSB(zp);
ip = ZTOI(zp);
/* Skip .zfs control nodes which do not exist on disk. */
@@ -534,6 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
uint64_t links;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2], btime[2];
+ inode_timespec_t tmp_ts;
uint64_t projid = ZFS_DEFAULT_PROJID;
sa_bulk_attr_t bulk[12];
int count = 0;
@@ -550,9 +559,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
ASSERT3P(zp->z_xattr_cached, ==, NULL);
zp->z_unlinked = B_FALSE;
zp->z_atime_dirty = B_FALSE;
+#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
zp->z_is_mapped = B_FALSE;
+#endif
zp->z_is_ctldir = B_FALSE;
- zp->z_is_stale = B_FALSE;
zp->z_suspended = B_FALSE;
zp->z_sa_hdl = NULL;
zp->z_mapcnt = 0;
@@ -604,9 +614,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
if (zp->z_pflags & ZFS_XATTR)
zp->z_xattr_parent = parent;
- ZFS_TIME_DECODE(&ip->i_atime, atime);
- ZFS_TIME_DECODE(&ip->i_mtime, mtime);
- ZFS_TIME_DECODE(&ip->i_ctime, ctime);
+ ZFS_TIME_DECODE(&tmp_ts, atime);
+ zpl_inode_set_atime_to_ts(ip, tmp_ts);
+ ZFS_TIME_DECODE(&tmp_ts, mtime);
+ zpl_inode_set_mtime_to_ts(ip, tmp_ts);
+ ZFS_TIME_DECODE(&tmp_ts, ctime);
+ zpl_inode_set_ctime_to_ts(ip, tmp_ts);
ZFS_TIME_DECODE(&zp->z_btime, btime);
ip->i_ino = zp->z_id;
@@ -631,7 +644,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
- zfsvfs->z_nr_znodes++;
mutex_exit(&zfsvfs->z_znodes_lock);
if (links > 0)
@@ -1187,6 +1199,7 @@ zfs_rezget(znode_t *zp)
uint64_t gen;
uint64_t z_uid, z_gid;
uint64_t atime[2], mtime[2], ctime[2], btime[2];
+ inode_timespec_t tmp_ts;
uint64_t projid = ZFS_DEFAULT_PROJID;
znode_hold_t *zh;
@@ -1279,9 +1292,12 @@ zfs_rezget(znode_t *zp)
zfs_uid_write(ZTOI(zp), z_uid);
zfs_gid_write(ZTOI(zp), z_gid);
- ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
- ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
- ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
+ ZFS_TIME_DECODE(&tmp_ts, atime);
+ zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
+ ZFS_TIME_DECODE(&tmp_ts, mtime);
+ zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
+ ZFS_TIME_DECODE(&tmp_ts, ctime);
+ zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
ZFS_TIME_DECODE(&zp->z_btime, btime);
if ((uint32_t)gen != ZTOI(zp)->i_generation) {
@@ -1389,21 +1405,24 @@ zfs_zinactive(znode_t *zp)
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
- inode_timespec_t now;
+ inode_timespec_t now, tmp_atime, tmp_ts;
gethrestime(&now);
+ tmp_atime = zpl_inode_get_atime(ip);
/*
* In relatime mode, only update the atime if the previous atime
* is earlier than either the ctime or mtime or if at least a day
* has passed since the last update of atime.
*/
- if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
+ tmp_ts = zpl_inode_get_mtime(ip);
+ if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
return (B_TRUE);
- if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
+ tmp_ts = zpl_inode_get_ctime(ip);
+ if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
return (B_TRUE);
- if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
+ if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
return (B_TRUE);
return (B_FALSE);
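
zfs_relatime_need_update() keeps its rule intact and only changes how the timespecs are fetched: update atime when it is not newer than mtime or ctime, or when it is at least a day stale. A runnable restatement, where the explicit sec/nsec comparison models zfs_compare_timespec(&a, &b) >= 0:

#include <stdbool.h>
#include <time.h>

static bool
ts_geq(struct timespec a, struct timespec b)	/* a >= b */
{
	return (a.tv_sec > b.tv_sec ||
	    (a.tv_sec == b.tv_sec && a.tv_nsec >= b.tv_nsec));
}

static bool
relatime_need_update(struct timespec atime, struct timespec mtime,
    struct timespec chtime, time_t now_sec)
{
	if (ts_geq(mtime, atime))	/* atime not newer than mtime... */
		return (true);
	if (ts_geq(chtime, atime))	/* ...or not newer than ctime... */
		return (true);
	/* ...or at least a day stale. */
	return (now_sec - atime.tv_sec >= 24 * 60 * 60);
}

int
main(void)
{
	struct timespec a = { 200, 0 }, m = { 100, 0 }, c = { 100, 0 };

	return (relatime_need_update(a, m, c, 250) ? 1 : 0);	/* false: 0 */
}
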
@@ -1426,7 +1445,7 @@ void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
uint64_t ctime[2])
{
- inode_timespec_t now;
+ inode_timespec_t now, tmp_ts;
gethrestime(&now);
@@ -1434,7 +1453,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
if (flag & ATTR_MTIME) {
ZFS_TIME_ENCODE(&now, mtime);
- ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
+ ZFS_TIME_DECODE(&tmp_ts, mtime);
+ zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
if (ZTOZSB(zp)->z_use_fuids) {
zp->z_pflags |= (ZFS_ARCHIVE |
ZFS_AV_MODIFIED);
@@ -1443,7 +1463,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
if (flag & ATTR_CTIME) {
ZFS_TIME_ENCODE(&now, ctime);
- ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
+ ZFS_TIME_DECODE(&tmp_ts, ctime);
+ zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
if (ZTOZSB(zp)->z_use_fuids)
zp->z_pflags |= ZFS_ARCHIVE;
}
@@ -1641,7 +1662,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
* Zero partial page cache entries. This must be done under a
* range lock in order to keep the ARC and page cache in sync.
*/
- if (zp->z_is_mapped) {
+ if (zn_has_cached_data(zp, off, off + len - 1)) {
loff_t first_page, last_page, page_len;
loff_t first_page_offset, last_page_offset;
@@ -1864,7 +1885,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
/* For the moment we expect all zpl props to be uint64_ts */
uint64_t val;
- char *name;
+ const char *name;
ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
VERIFY(nvpair_value_uint64(elem, &val) == 0);
@@ -1883,6 +1904,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
}
ASSERT(version != 0);
error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+ ASSERT(error == 0);
/*
* Create zap object used for SA attribute registration
@@ -1960,7 +1982,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
}
VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
- cr, NULL, &acl_ids));
+ cr, NULL, &acl_ids, zfs_init_idmap));
zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
ASSERT3P(zp, ==, rootzp);
error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
@@ -2136,7 +2158,6 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
} else if (error != ENOENT) {
return (error);
}
- error = 0;
for (;;) {
uint64_t pobj = 0;
@@ -2252,6 +2273,91 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
return (error);
}
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION)
+ pname = ZPL_VERSION_STR;
+ else
+ pname = zfs_prop_to_name(prop);
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_OFF;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
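
The zfs_get_zplprop() added above layers three sources: a per-objset cached copy, the on-disk master-node ZAP, and a hardcoded default. Below is a compilable model of that lookup order; zap_lookup_stub is a placeholder for the real zap_lookup() and always misses, and the property plumbing is reduced to a name/default pair.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define	PROP_UNINITIALIZED	UINT64_MAX /* models OBJSET_PROP_UNINITIALIZED */

/* Placeholder for the real zap_lookup(); always misses here. */
static int
zap_lookup_stub(const char *name, uint64_t *value)
{
	(void) name; (void) value;
	return (ENOENT);
}

static int
get_prop(uint64_t *cached, const char *name, uint64_t def, uint64_t *value)
{
	int error;

	if (cached != NULL && *cached != PROP_UNINITIALIZED) {
		*value = *cached;	/* 1: serve from the objset cache */
		return (0);
	}
	error = zap_lookup_stub(name, value);	/* 2: master-node ZAP */
	if (error == ENOENT) {
		*value = def;		/* 3: no value set, use the default */
		error = 0;
	}
	if (error == 0 && cached != NULL)
		*cached = *value;	/* populate the cache for next time */
	return (error);
}

int
main(void)
{
	uint64_t cache = PROP_UNINITIALIZED, v = 0;

	(void) get_prop(&cache, "version", 5, &v);	/* ZAP miss -> default */
	(void) get_prop(&cache, "version", 5, &v);	/* served from cache */
	printf("%llu\n", (unsigned long long)v);
	return (0);
}
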
#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
index dcab02b07894..21f3740f6fe6 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
@@ -223,14 +223,32 @@ int
zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
{
int ret;
- crypto_mechanism_t mech;
+ crypto_mechanism_t mech = {0};
uint_t keydata_len;
ASSERT(key != NULL);
ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+/*
+ * Workaround for deficiencies in GCC 12+ with UBSan enabled.
+ *
+ * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code
+ * below as violating -Warray-bounds.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && \
+ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \
+ defined(CONFIG_UBSAN))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
keydata_len = zio_crypt_table[crypt].ci_keylen;
+#if defined(__GNUC__) && !defined(__clang__) && \
+ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \
+ defined(CONFIG_UBSAN))
+#pragma GCC diagnostic pop
+#endif
memset(key, 0, sizeof (zio_crypt_key_t));
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
/* fill keydata buffers and salt with random data */
ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
@@ -282,7 +300,6 @@ zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
key->zk_crypt = crypt;
key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
key->zk_salt_count = 0;
- rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
return (0);
@@ -1388,7 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
boolean_t *no_crypt)
{
int ret;
- uint64_t txtype, lr_len;
+ uint64_t txtype, lr_len, nused;
uint_t nr_src, nr_dst, crypt_len;
uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
@@ -1415,7 +1432,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
zilc = (zil_chain_t *)src;
slrp = src + sizeof (zil_chain_t);
aadp = aadbuf;
- blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+ nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+ ASSERT3U(nused, >=, sizeof (zil_chain_t));
+ ASSERT3U(nused, <=, datalen);
+ blkend = src + nused;
/* calculate the number of encrypted iovecs we will need */
for (; slrp < blkend; slrp += lr_len) {
@@ -1428,6 +1448,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
txtype = BSWAP_64(lr->lrc_txtype);
lr_len = BSWAP_64(lr->lrc_reclen);
}
+ ASSERT3U(lr_len, >=, sizeof (lr_t));
+ ASSERT3U(lr_len, <=, blkend - slrp);
nr_iovecs++;
if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
@@ -1496,20 +1518,16 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
* authenticate it.
*/
if (txtype == TX_WRITE) {
- crypt_len = sizeof (lr_write_t) -
- sizeof (lr_t) - sizeof (blkptr_t);
+ const size_t o = offsetof(lr_write_t, lr_blkptr);
+ crypt_len = o - sizeof (lr_t);
src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
src_iovecs[nr_iovecs].iov_len = crypt_len;
dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
dst_iovecs[nr_iovecs].iov_len = crypt_len;
/* copy the bp now since it will not be encrypted */
- memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
- slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
- sizeof (blkptr_t));
- memcpy(aadp,
- slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
- sizeof (blkptr_t));
+ memcpy(dlrp + o, slrp + o, sizeof (blkptr_t));
+ memcpy(aadp, slrp + o, sizeof (blkptr_t));
aadp += sizeof (blkptr_t);
aad_len += sizeof (blkptr_t);
nr_iovecs++;
@@ -1526,6 +1544,21 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
nr_iovecs++;
total_len += crypt_len;
}
+ } else if (txtype == TX_CLONE_RANGE) {
+ const size_t o = offsetof(lr_clone_range_t, lr_nbps);
+ crypt_len = o - sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ /* copy the bps now since they will not be encrypted */
+ memcpy(dlrp + o, slrp + o, lr_len - o);
+ memcpy(aadp, slrp + o, lr_len - o);
+ aadp += lr_len - o;
+ aad_len += lr_len - o;
+ nr_iovecs++;
+ total_len += crypt_len;
} else {
crypt_len = lr_len - sizeof (lr_t);
src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
@@ -1891,6 +1924,9 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
crypto_ctx_template_t tmpl;
uint8_t *authbuf = NULL;
+ memset(&puio, 0, sizeof (puio));
+ memset(&cuio, 0, sizeof (cuio));
+
/*
* If the needed key is the current one, just use it. Otherwise we
* need to generate a temporary one from the given salt + master key.
@@ -1950,9 +1986,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
/* If the hardware implementation fails fall back to software */
}
- memset(&puio, 0, sizeof (puio));
- memset(&cuio, 0, sizeof (cuio));
-
/* create uios for encryption */
ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
@@ -1968,7 +2001,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
if (locked) {
rw_exit(&key->zk_salt_lock);
- locked = B_FALSE;
}
if (authbuf != NULL)
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index d5c222120a9d..8ee7fcecc7b7 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -42,7 +42,7 @@
static int
zpl_common_open(struct inode *ip, struct file *filp)
{
- if (filp->f_mode & FMODE_WRITE)
+ if (blk_mode_is_open_write(filp->f_mode))
return (-EACCES);
return (generic_file_open(ip, filp));
@@ -57,7 +57,8 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
int error = 0;
- ZPL_ENTER(zfsvfs);
+ if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
+ return (error);
if (!zpl_dir_emit_dots(filp, ctx))
goto out;
@@ -78,7 +79,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
ctx->pos++;
}
out:
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
return (error);
}
@@ -102,7 +103,11 @@ zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
* Get root directory attributes.
*/
static int
-#ifdef HAVE_USERNS_IOPS_GETATTR
+#ifdef HAVE_IDMAP_IOPS_GETATTR
+zpl_root_getattr_impl(struct mnt_idmap *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+#elif defined(HAVE_USERNS_IOPS_GETATTR)
zpl_root_getattr_impl(struct user_namespace *user_ns,
const struct path *path, struct kstat *stat, u32 request_mask,
unsigned int query_flags)
@@ -114,9 +119,13 @@ zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
(void) request_mask, (void) query_flags;
struct inode *ip = path->dentry->d_inode;
-#ifdef HAVE_USERNS_IOPS_GETATTR
+#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
#ifdef HAVE_GENERIC_FILLATTR_USERNS
generic_fillattr(user_ns, ip, stat);
+#elif defined(HAVE_GENERIC_FILLATTR_IDMAP)
+ generic_fillattr(user_ns, ip, stat);
+#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK)
+ generic_fillattr(user_ns, request_mask, ip, stat);
#else
(void) user_ns;
#endif
@@ -207,7 +216,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
return (!!dentry->d_inode);
}
-static const dentry_operations_t zpl_dops_snapdirs = {
+static dentry_operations_t zpl_dops_snapdirs = {
/*
* Auto mounting of snapshots is only supported for 2.6.37 and
* newer kernels. Prior to this kernel the ops->follow_link()
@@ -258,7 +267,8 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
uint64_t id, pos;
int error = 0;
- ZPL_ENTER(zfsvfs);
+ if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
+ return (error);
cookie = spl_fstrans_mark();
if (!zpl_dir_emit_dots(filp, ctx))
@@ -282,7 +292,7 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
}
out:
spl_fstrans_unmark(cookie);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
if (error == -ENOENT)
return (0);
@@ -310,6 +320,10 @@ static int
zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip,
struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
unsigned int flags)
+#elif defined(HAVE_IOPS_RENAME_IDMAP)
+zpl_snapdir_rename2(struct mnt_idmap *user_ns, struct inode *sdip,
+ struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
+ unsigned int flags)
#else
zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
struct inode *tdip, struct dentry *tdentry, unsigned int flags)
@@ -331,7 +345,9 @@ zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
return (error);
}
-#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS)
+#if (!defined(HAVE_RENAME_WANTS_FLAGS) && \
+ !defined(HAVE_IOPS_RENAME_USERNS) && \
+ !defined(HAVE_IOPS_RENAME_IDMAP))
static int
zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
struct inode *tdip, struct dentry *tdentry)
@@ -358,6 +374,9 @@ static int
#ifdef HAVE_IOPS_MKDIR_USERNS
zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip,
struct dentry *dentry, umode_t mode)
+#elif defined(HAVE_IOPS_MKDIR_IDMAP)
+zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip,
+ struct dentry *dentry, umode_t mode)
#else
zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
#endif
@@ -369,7 +388,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
+#if (defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP))
+ zpl_vap_init(vap, dip, mode | S_IFDIR, cr, user_ns);
+#else
+ zpl_vap_init(vap, dip, mode | S_IFDIR, cr, zfs_init_idmap);
+#endif
error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
if (error == 0) {
@@ -389,7 +412,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
* Get snapshot directory attributes.
*/
static int
-#ifdef HAVE_USERNS_IOPS_GETATTR
+#ifdef HAVE_IDMAP_IOPS_GETATTR
+zpl_snapdir_getattr_impl(struct mnt_idmap *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+#elif defined(HAVE_USERNS_IOPS_GETATTR)
zpl_snapdir_getattr_impl(struct user_namespace *user_ns,
const struct path *path, struct kstat *stat, u32 request_mask,
unsigned int query_flags)
@@ -401,11 +428,17 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
(void) request_mask, (void) query_flags;
struct inode *ip = path->dentry->d_inode;
zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
- ZPL_ENTER(zfsvfs);
-#ifdef HAVE_USERNS_IOPS_GETATTR
+ if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
+ return (error);
+#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
#ifdef HAVE_GENERIC_FILLATTR_USERNS
generic_fillattr(user_ns, ip, stat);
+#elif defined(HAVE_GENERIC_FILLATTR_IDMAP)
+ generic_fillattr(user_ns, ip, stat);
+#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK)
+ generic_fillattr(user_ns, request_mask, ip, stat);
#else
(void) user_ns;
#endif
@@ -422,7 +455,7 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
if (err != 0) {
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
return (-err);
}
stat->nlink += snap_count;
@@ -430,7 +463,7 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
stat->atime = current_time(ip);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
return (0);
}
@@ -463,7 +496,9 @@ const struct file_operations zpl_fops_snapdir = {
const struct inode_operations zpl_ops_snapdir = {
.lookup = zpl_snapdir_lookup,
.getattr = zpl_snapdir_getattr,
-#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
+#if (defined(HAVE_RENAME_WANTS_FLAGS) || \
+ defined(HAVE_IOPS_RENAME_USERNS) || \
+ defined(HAVE_IOPS_RENAME_IDMAP))
.rename = zpl_snapdir_rename2,
#else
.rename = zpl_snapdir_rename,
@@ -508,7 +543,8 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
znode_t *dzp;
int error = 0;
- ZPL_ENTER(zfsvfs);
+ if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
+ return (error);
cookie = spl_fstrans_mark();
if (zfsvfs->z_shares_dir == 0) {
@@ -527,7 +563,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
iput(ZTOI(dzp));
out:
spl_fstrans_unmark(cookie);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
ASSERT3S(error, <=, 0);
return (error);
@@ -553,6 +589,10 @@ static int
zpl_shares_getattr_impl(struct user_namespace *user_ns,
const struct path *path, struct kstat *stat, u32 request_mask,
unsigned int query_flags)
+#elif defined(HAVE_IDMAP_IOPS_GETATTR)
+zpl_shares_getattr_impl(struct mnt_idmap *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
#else
zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
@@ -564,12 +604,17 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
znode_t *dzp;
int error;
- ZPL_ENTER(zfsvfs);
+ if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
+ return (error);
if (zfsvfs->z_shares_dir == 0) {
-#ifdef HAVE_USERNS_IOPS_GETATTR
+#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
#ifdef HAVE_GENERIC_FILLATTR_USERNS
generic_fillattr(user_ns, path->dentry->d_inode, stat);
+#elif defined(HAVE_GENERIC_FILLATTR_IDMAP)
+ generic_fillattr(user_ns, path->dentry->d_inode, stat);
+#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK)
+ generic_fillattr(user_ns, request_mask, ip, stat);
#else
(void) user_ns;
#endif
@@ -578,25 +623,24 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
#endif
stat->nlink = stat->size = 2;
stat->atime = current_time(ip);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
return (0);
}
error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
if (error == 0) {
-#ifdef HAVE_USERNS_IOPS_GETATTR
-#ifdef HAVE_GENERIC_FILLATTR_USERNS
+#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
+ error = -zfs_getattr_fast(user_ns, request_mask, ZTOI(dzp),
+ stat);
+#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat);
#else
- (void) user_ns;
-#endif
-#else
error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat);
#endif
iput(ZTOI(dzp));
}
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
ASSERT3S(error, <=, 0);
return (error);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
index 5be63532d329..aa80b72e2d7a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 9a640fb40b67..9dec52215c7c 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -27,6 +27,7 @@
#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
+#include <linux/fs.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
@@ -37,6 +38,9 @@
defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
#include <linux/pagemap.h>
#endif
+#ifdef HAVE_FILE_FADVISE
+#include <linux/fadvise.h>
+#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif
@@ -191,9 +195,12 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
* zfs_putpage() respectively.
*/
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
- ZPL_ENTER(zfsvfs);
+ if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
+ atomic_dec_32(&zp->z_sync_writes_cnt);
+ return (error);
+ }
zil_commit(zfsvfs->z_log, zp->z_id);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
}
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
@@ -294,16 +301,11 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
#if defined(HAVE_VFS_IOV_ITER)
zfs_uio_iov_iter_init(uio, to, pos, count, skip);
#else
-#ifdef HAVE_IOV_ITER_TYPE
- zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
- iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
- count, skip);
-#else
- zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
- to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
+ zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos,
+ zfs_uio_iov_iter_type(to) & ITER_KVEC ?
+ UIO_SYSSPACE : UIO_USERSPACE,
count, skip);
#endif
-#endif
}
static ssize_t
@@ -618,7 +620,6 @@ static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct inode *ip = filp->f_mapping->host;
- znode_t *zp = ITOZ(ip);
int error;
fstrans_cookie_t cookie;
@@ -633,9 +634,12 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
if (error)
return (error);
+#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
+ znode_t *zp = ITOZ(ip);
mutex_enter(&zp->z_lock);
zp->z_is_mapped = B_TRUE;
mutex_exit(&zp->z_lock);
+#endif
return (error);
}
@@ -648,29 +652,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
static inline int
zpl_readpage_common(struct page *pp)
{
- struct inode *ip;
- struct page *pl[1];
- int error = 0;
fstrans_cookie_t cookie;
ASSERT(PageLocked(pp));
- ip = pp->mapping->host;
- pl[0] = pp;
cookie = spl_fstrans_mark();
- error = -zfs_getpage(ip, pl, 1);
+ int error = -zfs_getpage(pp->mapping->host, pp);
spl_fstrans_unmark(cookie);
- if (error) {
- SetPageError(pp);
- ClearPageUptodate(pp);
- } else {
- ClearPageError(pp);
- SetPageUptodate(pp);
- flush_dcache_page(pp);
- }
-
unlock_page(pp);
+
return (error);
}
@@ -729,15 +720,38 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
boolean_t *for_sync = data;
fstrans_cookie_t cookie;
+ int ret;
ASSERT(PageLocked(pp));
ASSERT(!PageWriteback(pp));
cookie = spl_fstrans_mark();
- (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
+ ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
spl_fstrans_unmark(cookie);
- return (0);
+ return (ret);
+}
+
+#ifdef HAVE_WRITEPAGE_T_FOLIO
+static int
+zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
+{
+ return (zpl_putpage(&pp->page, wbc, data));
+}
+#endif
+
+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+ struct writeback_control *wbc, void *data)
+{
+ int result;
+
+#ifdef HAVE_WRITEPAGE_T_FOLIO
+ result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
+#else
+ result = write_cache_pages(mapping, wbc, zpl_putpage, data);
+#endif
+ return (result);
}
static int
@@ -748,10 +762,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
enum writeback_sync_modes sync_mode;
int result;
- ZPL_ENTER(zfsvfs);
+ if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
+ return (result);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
wbc->sync_mode = WB_SYNC_ALL;
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
sync_mode = wbc->sync_mode;
/*
@@ -763,13 +778,13 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
*/
boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
wbc->sync_mode = WB_SYNC_NONE;
- result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync);
+ result = zpl_write_cache_pages(mapping, wbc, &for_sync);
if (sync_mode != wbc->sync_mode) {
- ZPL_ENTER(zfsvfs);
- ZPL_VERIFY_ZP(zp);
+ if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (result);
if (zfsvfs->z_log != NULL)
zil_commit(zfsvfs->z_log, zp->z_id);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
/*
* We need to call write_cache_pages() again (we can't just
@@ -779,8 +794,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
* details). That being said, this is a no-op in most cases.
*/
wbc->sync_mode = sync_mode;
- result = write_cache_pages(mapping, wbc, zpl_putpage,
- &for_sync);
+ result = zpl_write_cache_pages(mapping, wbc, &for_sync);
}
return (result);
}
@@ -906,6 +920,61 @@ zpl_ioctl_getversion(struct file *filp, void __user *arg)
return (copy_to_user(arg, &generation, sizeof (generation)));
}
+#ifdef HAVE_FILE_FADVISE
+static int
+zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
+{
+ struct inode *ip = file_inode(filp);
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os = zfsvfs->z_os;
+ int error = 0;
+
+ if (S_ISFIFO(ip->i_mode))
+ return (-ESPIPE);
+
+ if (offset < 0 || len < 0)
+ return (-EINVAL);
+
+ if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ return (error);
+
+ switch (advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_WILLNEED:
+#ifdef HAVE_GENERIC_FADVISE
+ if (zn_has_cached_data(zp, offset, offset + len - 1))
+ error = generic_fadvise(filp, offset, len, advice);
+#endif
+ /*
+ * Pass on the caller's size directly, but note that
+ * dmu_prefetch_max will effectively cap it. If there
+ * really is a larger sequential access pattern, perhaps
+ * dmu_zfetch will detect it.
+ */
+ if (len == 0)
+ len = i_size_read(ip) - offset;
+
+ dmu_prefetch(os, zp->z_id, 0, offset, len,
+ ZIO_PRIORITY_ASYNC_READ);
+ break;
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_DONTNEED:
+ case POSIX_FADV_NOREUSE:
+ /* ignored for now */
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ zfs_exit(zfsvfs, FTAG);
+
+ return (error);
+}
+#endif /* HAVE_FILE_FADVISE */
+
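
For context, the handler above is reached from userspace through
posix_fadvise(2). A hedged sketch with a placeholder path; a zero length
means "to end of file", matching the len == 0 case above:

    #include <fcntl.h>
    #include <unistd.h>

    int
    prefetch_file(const char *path)
    {
            int fd = open(path, O_RDONLY);
            if (fd < 0)
                    return (-1);
            /* POSIX_FADV_WILLNEED lands in zpl_fadvise() and feeds
             * dmu_prefetch() as shown above. */
            int err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
            close(fd);
            return (err);
    }
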
#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
@@ -975,7 +1044,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
!capable(CAP_LINUX_IMMUTABLE))
return (-EPERM);
- if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
+ if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
return (-EACCES);
xva_init(xva);
@@ -1022,7 +1091,7 @@ zpl_ioctl_setflags(struct file *filp, void __user *arg)
crhold(cr);
cookie = spl_fstrans_mark();
- err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+ err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
spl_fstrans_unmark(cookie);
crfree(cr);
@@ -1070,7 +1139,7 @@ zpl_ioctl_setxattr(struct file *filp, void __user *arg)
crhold(cr);
cookie = spl_fstrans_mark();
- err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+ err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
spl_fstrans_unmark(cookie);
crfree(cr);
@@ -1105,7 +1174,7 @@ __zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
!capable(CAP_LINUX_IMMUTABLE))
return (-EPERM);
- if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
+ if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
return (-EACCES);
xva_init(xva);
@@ -1158,7 +1227,7 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
crhold(cr);
cookie = spl_fstrans_mark();
- err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+ err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
spl_fstrans_unmark(cookie);
crfree(cr);
@@ -1183,6 +1252,12 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return (zpl_ioctl_getdosflags(filp, (void *)arg));
case ZFS_IOC_SETDOSFLAGS:
return (zpl_ioctl_setdosflags(filp, (void *)arg));
+ case ZFS_IOC_COMPAT_FICLONE:
+ return (zpl_ioctl_ficlone(filp, (void *)arg));
+ case ZFS_IOC_COMPAT_FICLONERANGE:
+ return (zpl_ioctl_ficlonerange(filp, (void *)arg));
+ case ZFS_IOC_COMPAT_FIDEDUPERANGE:
+ return (zpl_ioctl_fideduperange(filp, (void *)arg));
default:
return (-ENOTTY);
}
@@ -1209,7 +1284,6 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
#endif /* CONFIG_COMPAT */
-
const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
.readpages = zpl_readpages,
@@ -1232,7 +1306,12 @@ const struct address_space_operations zpl_address_space_operations = {
#endif
};
+#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
+const struct file_operations_extend zpl_file_operations = {
+ .kabi_fops = {
+#else
const struct file_operations zpl_file_operations = {
+#endif
.open = zpl_open,
.release = zpl_release,
.llseek = zpl_llseek,
@@ -1244,7 +1323,11 @@ const struct file_operations zpl_file_operations = {
.read_iter = zpl_iter_read,
.write_iter = zpl_iter_write,
#ifdef HAVE_VFS_IOV_ITER
+#ifdef HAVE_COPY_SPLICE_READ
+ .splice_read = copy_splice_read,
+#else
.splice_read = generic_file_splice_read,
+#endif
.splice_write = iter_file_splice_write,
#endif
#else
@@ -1259,10 +1342,30 @@ const struct file_operations zpl_file_operations = {
.aio_fsync = zpl_aio_fsync,
#endif
.fallocate = zpl_fallocate,
+#ifdef HAVE_VFS_COPY_FILE_RANGE
+ .copy_file_range = zpl_copy_file_range,
+#endif
+#ifdef HAVE_VFS_CLONE_FILE_RANGE
+ .clone_file_range = zpl_clone_file_range,
+#endif
+#ifdef HAVE_VFS_REMAP_FILE_RANGE
+ .remap_file_range = zpl_remap_file_range,
+#endif
+#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
+ .dedupe_file_range = zpl_dedupe_file_range,
+#endif
+#ifdef HAVE_FILE_FADVISE
+ .fadvise = zpl_fadvise,
+#endif
.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = zpl_compat_ioctl,
#endif
+#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
+ }, /* kabi_fops */
+ .copy_file_range = zpl_copy_file_range,
+ .clone_file_range = zpl_clone_file_range,
+#endif
};
const struct file_operations zpl_dir_file_operations = {
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c
new file mode 100644
index 000000000000..64728fdb1187
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c
@@ -0,0 +1,299 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2023, Klara Inc.
+ */
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <linux/fs.h>
+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
+#include <linux/splice.h>
+#endif
+#include <sys/file.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfeature.h>
+
+/*
+ * Clone part of a file via block cloning.
+ *
+ * Note that we are not required to update file offsets; the kernel will take
+ * care of that depending on how it was called.
+ */
+static ssize_t
+zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, size_t len)
+{
+ struct inode *src_i = file_inode(src_file);
+ struct inode *dst_i = file_inode(dst_file);
+ uint64_t src_off_o = (uint64_t)src_off;
+ uint64_t dst_off_o = (uint64_t)dst_off;
+ uint64_t len_o = (uint64_t)len;
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int err;
+
+ if (!zfs_bclone_enabled)
+ return (-EOPNOTSUPP);
+
+ if (!spa_feature_is_enabled(
+ dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING))
+ return (-EOPNOTSUPP);
+
+ if (src_i != dst_i)
+ spl_inode_lock_shared(src_i);
+ spl_inode_lock(dst_i);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i),
+ &dst_off_o, &len_o, cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ spl_inode_unlock(dst_i);
+ if (src_i != dst_i)
+ spl_inode_unlock_shared(src_i);
+
+ if (err < 0)
+ return (err);
+
+ return ((ssize_t)len_o);
+}
+
+#if defined(HAVE_VFS_COPY_FILE_RANGE) || \
+ defined(HAVE_VFS_FILE_OPERATIONS_EXTEND)
+/*
+ * Entry point for copy_file_range(). Copy len bytes from src_off in src_file
+ * to dst_off in dst_file. We are permitted to do this however we like, so we
+ * try to just clone the blocks, and if we can't support it, fall back to the
+ * kernel's generic byte copy function.
+ */
+ssize_t
+zpl_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags)
+{
+ ssize_t ret;
+
+ /* Flags is reserved for future extensions and must be zero. */
+ if (flags != 0)
+ return (-EINVAL);
+
+ /* Try to do it via zfs_clone_range() and allow shortening. */
+ ret = zpl_clone_file_range_impl(src_file, src_off,
+ dst_file, dst_off, len);
+
+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
+ /*
+ * Since Linux 5.3 the filesystem driver is responsible for executing
+ * an appropriate fallback, and a generic fallback function is provided.
+ */
+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
+ ret == -EAGAIN)
+ ret = generic_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len, flags);
+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
+ /*
+	 * Since Linux 6.8 the fallback function is called
+	 * splice_copy_file_range and has a slightly different signature.
+ */
+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
+ ret == -EAGAIN)
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len);
+#else
+ /*
+	 * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
+	 * to the kernel that it should fall back to a content copy.
+ */
+ if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
+ ret = -EOPNOTSUPP;
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
+
+ return (ret);
+}
+#endif /* HAVE_VFS_COPY_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
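
As a usage-level illustration (a hedged userspace sketch, not part of the
patch): copy_file_range(2) is the call this entry point services, and the
caller never sees whether the bytes were cloned or copied. File names are
placeholders:

    #define _GNU_SOURCE
    #define _FILE_OFFSET_BITS 64
    #include <fcntl.h>
    #include <unistd.h>

    int
    main(void)
    {
            int src = open("src.dat", O_RDONLY);
            int dst = open("dst.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
            if (src < 0 || dst < 0)
                    return (1);

            off_t off_in = 0, off_out = 0;
            ssize_t n;
            /* The filesystem clones when it can; otherwise the kernel
             * (or the filesystem) falls back to a byte copy. */
            while ((n = copy_file_range(src, &off_in, dst, &off_out,
                1 << 20, 0)) > 0)
                    ;

            close(src);
            close(dst);
            return (n < 0 ? 1 : 0);
    }
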
+
+#ifdef HAVE_VFS_REMAP_FILE_RANGE
+/*
+ * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE.
+ *
+ * FICLONE and FICLONERANGE are basically the same as copy_file_range(), except
+ * that they must clone - they cannot fall back to copying. FICLONE is exactly
+ * FICLONERANGE, for the entire file. We don't need to try to tell them apart;
+ * the kernel will sort that out for us.
+ *
+ * FIDEDUPERANGE is for turning a non-clone into a clone: it compares the
+ * range in both files and, if they are identical, arranges for them to be
+ * backed by the same storage.
+ *
+ * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
+ * if we want. It's designed for filesystems that may need to shorten the
+ * length for alignment, EOF, or any other requirement. ZFS may shorten the
+ * request when there is outstanding dirty data which hasn't been written.
+ */
+loff_t
+zpl_remap_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags)
+{
+ if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
+ return (-EINVAL);
+
+ /* No support for dedup yet */
+ if (flags & REMAP_FILE_DEDUP)
+ return (-EOPNOTSUPP);
+
+ /* Zero length means to clone everything to the end of the file */
+ if (len == 0)
+ len = i_size_read(file_inode(src_file)) - src_off;
+
+ ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+ dst_file, dst_off, len);
+
+ if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
+ ret = -EINVAL;
+
+ return (ret);
+}
+#endif /* HAVE_VFS_REMAP_FILE_RANGE */
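
For reference, FICLONERANGE reaches the handler above via ioctl(2) with
struct file_clone_range from <linux/fs.h>. A hedged userspace sketch;
offsets and lengths must satisfy the filesystem's alignment rules:

    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* FICLONERANGE, struct file_clone_range */

    int
    clone_range(int src_fd, int dst_fd, __u64 src_off, __u64 len,
        __u64 dst_off)
    {
            struct file_clone_range fcr = {
                    .src_fd = src_fd,
                    .src_offset = src_off,
                    .src_length = len,      /* 0 means "to EOF", as above */
                    .dest_offset = dst_off,
            };
            /* The ioctl is issued on the destination descriptor. */
            return (ioctl(dst_fd, FICLONERANGE, &fcr));
    }
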
+
+#if defined(HAVE_VFS_CLONE_FILE_RANGE) || \
+ defined(HAVE_VFS_FILE_OPERATIONS_EXTEND)
+/*
+ * Entry point for FICLONE and FICLONERANGE, before Linux 4.20.
+ */
+int
+zpl_clone_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, uint64_t len)
+{
+ /* Zero length means to clone everything to the end of the file */
+ if (len == 0)
+ len = i_size_read(file_inode(src_file)) - src_off;
+
+ /* The entire length must be cloned or this is an error. */
+ ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+ dst_file, dst_off, len);
+
+ if (ret >= 0 && ret != len)
+ ret = -EINVAL;
+
+ return (ret);
+}
+#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
+
+#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
+/*
+ * Entry point for FIDEDUPERANGE, before Linux 4.20.
+ */
+int
+zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, uint64_t len)
+{
+ /* No support for dedup yet */
+ return (-EOPNOTSUPP);
+}
+#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */
+
+/* Entry point for FICLONE, before Linux 4.5. */
+long
+zpl_ioctl_ficlone(struct file *dst_file, void *arg)
+{
+ unsigned long sfd = (unsigned long)arg;
+
+ struct file *src_file = fget(sfd);
+ if (src_file == NULL)
+ return (-EBADF);
+
+ if (dst_file->f_op != src_file->f_op) {
+ fput(src_file);
+ return (-EXDEV);
+ }
+
+ size_t len = i_size_read(file_inode(src_file));
+
+ ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
+
+ fput(src_file);
+
+ if (ret < 0) {
+ if (ret == -EOPNOTSUPP)
+ return (-ENOTTY);
+ return (ret);
+ }
+
+ if (ret != len)
+ return (-EINVAL);
+
+ return (0);
+}
+
+/* Entry point for FICLONERANGE, before Linux 4.5. */
+long
+zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
+{
+ zfs_ioc_compat_file_clone_range_t fcr;
+
+ if (copy_from_user(&fcr, arg, sizeof (fcr)))
+ return (-EFAULT);
+
+ struct file *src_file = fget(fcr.fcr_src_fd);
+ if (src_file == NULL)
+ return (-EBADF);
+
+ if (dst_file->f_op != src_file->f_op) {
+ fput(src_file);
+ return (-EXDEV);
+ }
+
+ size_t len = fcr.fcr_src_length;
+ if (len == 0)
+ len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
+
+ ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
+ dst_file, fcr.fcr_dest_offset, len);
+
+ fput(src_file);
+
+ if (ret < 0) {
+ if (ret == -EOPNOTSUPP)
+ return (-ENOTTY);
+ return (ret);
+ }
+
+ if (ret != len)
+ return (-EINVAL);
+
+ return (0);
+}
+
+/* Entry point for FIDEDUPERANGE, before Linux 4.5. */
+long
+zpl_ioctl_fideduperange(struct file *filp, void *arg)
+{
+ (void) arg;
+
+ /* No support for dedup yet */
+ return (-ENOTTY);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
index 4f79265a0856..ad1753f7a071 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -24,6 +24,7 @@
*/
+#include <sys/sysmacros.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
@@ -33,7 +34,6 @@
#include <sys/zpl.h>
#include <sys/file.h>
-
static struct dentry *
zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
@@ -112,18 +112,22 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
}
void
-zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr)
+zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr,
+ zidmap_t *mnt_ns)
{
vap->va_mask = ATTR_MODE;
vap->va_mode = mode;
- vap->va_uid = crgetuid(cr);
- if (dir && dir->i_mode & S_ISGID) {
+ vap->va_uid = zfs_vfsuid_to_uid(mnt_ns,
+ zfs_i_user_ns(dir), crgetuid(cr));
+
+ if (dir->i_mode & S_ISGID) {
vap->va_gid = KGID_TO_SGID(dir->i_gid);
if (S_ISDIR(mode))
vap->va_mode |= S_ISGID;
} else {
- vap->va_gid = crgetgid(cr);
+ vap->va_gid = zfs_vfsgid_to_gid(mnt_ns,
+ zfs_i_user_ns(dir), crgetgid(cr));
}
}
@@ -131,6 +135,9 @@ static int
#ifdef HAVE_IOPS_CREATE_USERNS
zpl_create(struct user_namespace *user_ns, struct inode *dir,
struct dentry *dentry, umode_t mode, bool flag)
+#elif defined(HAVE_IOPS_CREATE_IDMAP)
+zpl_create(struct mnt_idmap *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool flag)
#else
zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag)
#endif
@@ -140,14 +147,17 @@ zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag)
vattr_t *vap;
int error;
fstrans_cookie_t cookie;
+#if !(defined(HAVE_IOPS_CREATE_USERNS) || defined(HAVE_IOPS_CREATE_IDMAP))
+ zidmap_t *user_ns = kcred->user_ns;
+#endif
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, mode, cr);
+ zpl_vap_init(vap, dir, mode, cr, user_ns);
cookie = spl_fstrans_mark();
error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
- mode, &zp, cr, 0, NULL);
+ mode, &zp, cr, 0, NULL, user_ns);
if (error == 0) {
error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
if (error == 0)
@@ -174,6 +184,9 @@ static int
#ifdef HAVE_IOPS_MKNOD_USERNS
zpl_mknod(struct user_namespace *user_ns, struct inode *dir,
struct dentry *dentry, umode_t mode,
+#elif defined(HAVE_IOPS_MKNOD_IDMAP)
+zpl_mknod(struct mnt_idmap *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode,
#else
zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
#endif
@@ -184,6 +197,9 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
vattr_t *vap;
int error;
fstrans_cookie_t cookie;
+#if !(defined(HAVE_IOPS_MKNOD_USERNS) || defined(HAVE_IOPS_MKNOD_IDMAP))
+ zidmap_t *user_ns = kcred->user_ns;
+#endif
/*
* We currently expect Linux to supply rdev=0 for all sockets
@@ -194,12 +210,12 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, mode, cr);
+ zpl_vap_init(vap, dir, mode, cr, user_ns);
vap->va_rdev = rdev;
cookie = spl_fstrans_mark();
error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
- mode, &zp, cr, 0, NULL);
+ mode, &zp, cr, 0, NULL, user_ns);
if (error == 0) {
error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
if (error == 0)
@@ -224,18 +240,29 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
#ifdef HAVE_TMPFILE
static int
+#ifdef HAVE_TMPFILE_IDMAP
+zpl_tmpfile(struct mnt_idmap *userns, struct inode *dir,
+ struct file *file, umode_t mode)
+#elif !defined(HAVE_TMPFILE_DENTRY)
+zpl_tmpfile(struct user_namespace *userns, struct inode *dir,
+ struct file *file, umode_t mode)
+#else
#ifdef HAVE_TMPFILE_USERNS
zpl_tmpfile(struct user_namespace *userns, struct inode *dir,
struct dentry *dentry, umode_t mode)
#else
zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
#endif
+#endif
{
cred_t *cr = CRED();
struct inode *ip;
vattr_t *vap;
int error;
fstrans_cookie_t cookie;
+#if !(defined(HAVE_TMPFILE_USERNS) || defined(HAVE_TMPFILE_IDMAP))
+ zidmap_t *userns = kcred->user_ns;
+#endif
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
@@ -245,18 +272,28 @@ zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
*/
if (!IS_POSIXACL(dir))
mode &= ~current_umask();
- zpl_vap_init(vap, dir, mode, cr);
+ zpl_vap_init(vap, dir, mode, cr, userns);
cookie = spl_fstrans_mark();
- error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL);
+ error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL, userns);
if (error == 0) {
/* d_tmpfile will do drop_nlink, so we should set it first */
set_nlink(ip, 1);
+#ifndef HAVE_TMPFILE_DENTRY
+ d_tmpfile(file, ip);
+
+ error = zpl_xattr_security_init(ip, dir,
+ &file->f_path.dentry->d_name);
+#else
d_tmpfile(dentry, ip);
error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+#endif
if (error == 0)
error = zpl_init_acl(ip, dir);
+#ifndef HAVE_TMPFILE_DENTRY
+ error = finish_open_simple(file, error);
+#endif
/*
* don't need to handle error here, file is already in
* unlinked set.
@@ -302,6 +339,9 @@ static int
#ifdef HAVE_IOPS_MKDIR_USERNS
zpl_mkdir(struct user_namespace *user_ns, struct inode *dir,
struct dentry *dentry, umode_t mode)
+#elif defined(HAVE_IOPS_MKDIR_IDMAP)
+zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
#else
zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
#endif
@@ -311,13 +351,17 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
znode_t *zp;
int error;
fstrans_cookie_t cookie;
+#if !(defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP))
+ zidmap_t *user_ns = kcred->user_ns;
+#endif
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
+ zpl_vap_init(vap, dir, mode | S_IFDIR, cr, user_ns);
cookie = spl_fstrans_mark();
- error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL);
+ error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL,
+ user_ns);
if (error == 0) {
error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
if (error == 0)
@@ -371,6 +415,10 @@ static int
zpl_getattr_impl(struct user_namespace *user_ns,
const struct path *path, struct kstat *stat, u32 request_mask,
unsigned int query_flags)
+#elif defined(HAVE_IDMAP_IOPS_GETATTR)
+zpl_getattr_impl(struct mnt_idmap *user_ns,
+ const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
#else
zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
unsigned int query_flags)
@@ -387,7 +435,9 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
* XXX query_flags currently ignored.
*/
-#ifdef HAVE_USERNS_IOPS_GETATTR
+#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
+ error = -zfs_getattr_fast(user_ns, request_mask, ip, stat);
+#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR))
error = -zfs_getattr_fast(user_ns, ip, stat);
#else
error = -zfs_getattr_fast(kcred->user_ns, ip, stat);
@@ -426,9 +476,12 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
ZPL_GETATTR_WRAPPER(zpl_getattr);
static int
-#ifdef HAVE_SETATTR_PREPARE_USERNS
+#ifdef HAVE_USERNS_IOPS_SETATTR
zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry,
struct iattr *ia)
+#elif defined(HAVE_IDMAP_IOPS_SETATTR)
+zpl_setattr(struct mnt_idmap *user_ns, struct dentry *dentry,
+ struct iattr *ia)
#else
zpl_setattr(struct dentry *dentry, struct iattr *ia)
#endif
@@ -439,7 +492,13 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
int error;
fstrans_cookie_t cookie;
- error = zpl_setattr_prepare(kcred->user_ns, dentry, ia);
+#ifdef HAVE_SETATTR_PREPARE_USERNS
+ error = zpl_setattr_prepare(user_ns, dentry, ia);
+#elif defined(HAVE_SETATTR_PREPARE_IDMAP)
+ error = zpl_setattr_prepare(user_ns, dentry, ia);
+#else
+ error = zpl_setattr_prepare(zfs_init_idmap, dentry, ia);
+#endif
if (error)
return (error);
@@ -447,18 +506,37 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK;
vap->va_mode = ia->ia_mode;
- vap->va_uid = KUID_TO_SUID(ia->ia_uid);
- vap->va_gid = KGID_TO_SGID(ia->ia_gid);
+ if (ia->ia_valid & ATTR_UID)
+#ifdef HAVE_IATTR_VFSID
+ vap->va_uid = zfs_vfsuid_to_uid(user_ns, zfs_i_user_ns(ip),
+ __vfsuid_val(ia->ia_vfsuid));
+#else
+ vap->va_uid = KUID_TO_SUID(ia->ia_uid);
+#endif
+ if (ia->ia_valid & ATTR_GID)
+#ifdef HAVE_IATTR_VFSID
+ vap->va_gid = zfs_vfsgid_to_gid(user_ns, zfs_i_user_ns(ip),
+ __vfsgid_val(ia->ia_vfsgid));
+#else
+ vap->va_gid = KGID_TO_SGID(ia->ia_gid);
+#endif
vap->va_size = ia->ia_size;
vap->va_atime = ia->ia_atime;
vap->va_mtime = ia->ia_mtime;
vap->va_ctime = ia->ia_ctime;
if (vap->va_mask & ATTR_ATIME)
- ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip);
+ zpl_inode_set_atime_to_ts(ip,
+ zpl_inode_timestamp_truncate(ia->ia_atime, ip));
cookie = spl_fstrans_mark();
- error = -zfs_setattr(ITOZ(ip), vap, 0, cr);
+#ifdef HAVE_USERNS_IOPS_SETATTR
+ error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns);
+#elif defined(HAVE_IDMAP_IOPS_SETATTR)
+ error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns);
+#else
+ error = -zfs_setattr(ITOZ(ip), vap, 0, cr, zfs_init_idmap);
+#endif
if (!error && (ia->ia_valid & ATTR_MODE))
error = zpl_chmod_acl(ip);
@@ -474,32 +552,47 @@ static int
#ifdef HAVE_IOPS_RENAME_USERNS
zpl_rename2(struct user_namespace *user_ns, struct inode *sdip,
struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
- unsigned int flags)
+ unsigned int rflags)
+#elif defined(HAVE_IOPS_RENAME_IDMAP)
+zpl_rename2(struct mnt_idmap *user_ns, struct inode *sdip,
+ struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry,
+ unsigned int rflags)
#else
zpl_rename2(struct inode *sdip, struct dentry *sdentry,
- struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+ struct inode *tdip, struct dentry *tdentry, unsigned int rflags)
#endif
{
cred_t *cr = CRED();
+ vattr_t *wo_vap = NULL;
int error;
fstrans_cookie_t cookie;
-
- /* We don't have renameat2(2) support */
- if (flags)
- return (-EINVAL);
+#if !(defined(HAVE_IOPS_RENAME_USERNS) || defined(HAVE_IOPS_RENAME_IDMAP))
+ zidmap_t *user_ns = kcred->user_ns;
+#endif
crhold(cr);
+ if (rflags & RENAME_WHITEOUT) {
+ wo_vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(wo_vap, sdip, S_IFCHR, cr, user_ns);
+ wo_vap->va_rdev = makedevice(0, 0);
+ }
+
cookie = spl_fstrans_mark();
error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip),
- dname(tdentry), cr, 0);
+ dname(tdentry), cr, 0, rflags, wo_vap, user_ns);
spl_fstrans_unmark(cookie);
+ if (wo_vap)
+ kmem_free(wo_vap, sizeof (vattr_t));
crfree(cr);
ASSERT3S(error, <=, 0);
return (error);
}
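
For context, RENAME_WHITEOUT is requested through renameat2(2); overlayfs
is the usual in-kernel caller. A hedged userspace sketch with placeholder
paths; it needs CAP_MKNOD and glibc renameat2 support:

    #define _GNU_SOURCE
    #include <stdio.h>      /* renameat2, RENAME_WHITEOUT */
    #include <fcntl.h>      /* AT_FDCWD */

    int
    rename_with_whiteout(const char *from, const char *to)
    {
            /* Leaves a 0:0 character-device whiteout at the source path,
             * matching the S_IFCHR / makedevice(0, 0) vattr set up in
             * zpl_rename2() above. */
            return (renameat2(AT_FDCWD, from, AT_FDCWD, to,
                RENAME_WHITEOUT));
    }
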
-#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS)
+#if !defined(HAVE_IOPS_RENAME_USERNS) && \
+ !defined(HAVE_RENAME_WANTS_FLAGS) && \
+ !defined(HAVE_RENAME2) && \
+ !defined(HAVE_IOPS_RENAME_IDMAP)
static int
zpl_rename(struct inode *sdip, struct dentry *sdentry,
struct inode *tdip, struct dentry *tdentry)
@@ -512,6 +605,9 @@ static int
#ifdef HAVE_IOPS_SYMLINK_USERNS
zpl_symlink(struct user_namespace *user_ns, struct inode *dir,
struct dentry *dentry, const char *name)
+#elif defined(HAVE_IOPS_SYMLINK_IDMAP)
+zpl_symlink(struct mnt_idmap *user_ns, struct inode *dir,
+ struct dentry *dentry, const char *name)
#else
zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
#endif
@@ -521,14 +617,17 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
znode_t *zp;
int error;
fstrans_cookie_t cookie;
+#if !(defined(HAVE_IOPS_SYMLINK_USERNS) || defined(HAVE_IOPS_SYMLINK_IDMAP))
+ zidmap_t *user_ns = kcred->user_ns;
+#endif
crhold(cr);
vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
- zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
+ zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr, user_ns);
cookie = spl_fstrans_mark();
error = -zfs_symlink(ITOZ(dir), dname(dentry), vap,
- (char *)name, &zp, cr, 0);
+ (char *)name, &zp, cr, 0, user_ns);
if (error == 0) {
error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
if (error) {
@@ -678,7 +777,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
return (-EMLINK);
crhold(cr);
- ip->i_ctime = current_time(ip);
+ zpl_inode_set_ctime_to_ts(ip, current_time(ip));
/* Must have an existing ref, so igrab() cannot return NULL */
VERIFY3P(igrab(ip), !=, NULL);
@@ -698,46 +797,6 @@ out:
return (error);
}
-static int
-#ifdef HAVE_D_REVALIDATE_NAMEIDATA
-zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
- unsigned int flags = (nd ? nd->flags : 0);
-#else
-zpl_revalidate(struct dentry *dentry, unsigned int flags)
-{
-#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
- /* CSTYLED */
- zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
- int error;
-
- if (flags & LOOKUP_RCU)
- return (-ECHILD);
-
- /*
- * After a rollback negative dentries created before the rollback
- * time must be invalidated. Otherwise they can obscure files which
- * are only present in the rolled back dataset.
- */
- if (dentry->d_inode == NULL) {
- spin_lock(&dentry->d_lock);
- error = time_before(dentry->d_time, zfsvfs->z_rollback_time);
- spin_unlock(&dentry->d_lock);
-
- if (error)
- return (0);
- }
-
- /*
- * The dentry may reference a stale inode if a mounted file system
- * was rolled back to a point in time where the object didn't exist.
- */
- if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
- return (0);
-
- return (1);
-}
-
const struct inode_operations zpl_inode_operations = {
.setattr = zpl_setattr,
.getattr = zpl_getattr,
@@ -751,11 +810,20 @@ const struct inode_operations zpl_inode_operations = {
#if defined(HAVE_SET_ACL)
.set_acl = zpl_set_acl,
#endif /* HAVE_SET_ACL */
+#if defined(HAVE_GET_INODE_ACL)
+ .get_inode_acl = zpl_get_acl,
+#else
.get_acl = zpl_get_acl,
+#endif /* HAVE_GET_INODE_ACL */
#endif /* CONFIG_FS_POSIX_ACL */
};
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+const struct inode_operations_wrapper zpl_dir_inode_operations = {
+ .ops = {
+#else
const struct inode_operations zpl_dir_inode_operations = {
+#endif
.create = zpl_create,
.lookup = zpl_lookup,
.link = zpl_link,
@@ -764,7 +832,11 @@ const struct inode_operations zpl_dir_inode_operations = {
.mkdir = zpl_mkdir,
.rmdir = zpl_rmdir,
.mknod = zpl_mknod,
-#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
+#ifdef HAVE_RENAME2
+ .rename2 = zpl_rename2,
+#elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS)
+ .rename = zpl_rename2,
+#elif defined(HAVE_IOPS_RENAME_IDMAP)
.rename = zpl_rename2,
#else
.rename = zpl_rename,
@@ -784,8 +856,16 @@ const struct inode_operations zpl_dir_inode_operations = {
#if defined(HAVE_SET_ACL)
.set_acl = zpl_set_acl,
#endif /* HAVE_SET_ACL */
+#if defined(HAVE_GET_INODE_ACL)
+ .get_inode_acl = zpl_get_acl,
+#else
.get_acl = zpl_get_acl,
+#endif /* HAVE_GET_INODE_ACL */
#endif /* CONFIG_FS_POSIX_ACL */
+#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
+ },
+ .rename2 = zpl_rename2,
+#endif
};
const struct inode_operations zpl_symlink_inode_operations = {
@@ -823,10 +903,10 @@ const struct inode_operations zpl_special_inode_operations = {
#if defined(HAVE_SET_ACL)
.set_acl = zpl_set_acl,
#endif /* HAVE_SET_ACL */
+#if defined(HAVE_GET_INODE_ACL)
+ .get_inode_acl = zpl_get_acl,
+#else
.get_acl = zpl_get_acl,
+#endif /* HAVE_GET_INODE_ACL */
#endif /* CONFIG_FS_POSIX_ACL */
};
-
-dentry_operations_t zpl_dentry_operations = {
- .d_revalidate = zpl_revalidate,
-};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index b18efde9b18a..d98d32c1f9fb 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2023, Datto Inc. All rights reserved.
*/
@@ -185,7 +186,9 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data)
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
- ZPL_ENTER(zfsvfs);
+ int error;
+ if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
+ return (error);
char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
dmu_objset_name(zfsvfs->z_os, fsname);
@@ -205,7 +208,7 @@ __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
return (0);
}
@@ -233,6 +236,18 @@ __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
}
#endif /* CONFIG_FS_POSIX_ACL */
+ switch (zfsvfs->z_case) {
+ case ZFS_CASE_SENSITIVE:
+ seq_puts(seq, ",casesensitive");
+ break;
+ case ZFS_CASE_INSENSITIVE:
+ seq_puts(seq, ",caseinsensitive");
+ break;
+ default:
+ seq_puts(seq, ",casemixed");
+ break;
+ }
+
return (0);
}
@@ -262,11 +277,14 @@ zpl_test_super(struct super_block *s, void *data)
{
zfsvfs_t *zfsvfs = s->s_fs_info;
objset_t *os = data;
-
- if (zfsvfs == NULL)
- return (0);
-
- return (os == zfsvfs->z_os);
+ /*
+	 * If the os doesn't match the z_os in the super_block, it is not a
+	 * match; a match would imply a multimount of the dataset. A
+	 * simultaneous operation that changes z_os, e.g. a rollback, can
+	 * cause the match to be missed, but in that case the user will get
+	 * an EBUSY.
+ */
+ return (zfsvfs != NULL && os == zfsvfs->z_os);
}
static struct super_block *
@@ -292,12 +310,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+ /*
+ * Recheck with the lock held to prevent mounting the wrong dataset
+ * since z_os can be stale when the teardown lock is held.
+ *
+	 * We can't do this in zpl_test_super since it runs under a spinlock
+	 * and the s_umount lock is not held there, so it would race with
+	 * zfs_umount and zfsvfs could be freed.
+ */
+ if (!IS_ERR(s) && s->s_fs_info != NULL) {
+ zfsvfs_t *zfsvfs = s->s_fs_info;
+ if (zpl_enter(zfsvfs, FTAG) == 0) {
+ if (os != zfsvfs->z_os)
+ err = -SET_ERROR(EBUSY);
+ zpl_exit(zfsvfs, FTAG);
+ } else {
+ err = -SET_ERROR(EBUSY);
+ }
+ }
dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
dsl_dataset_rele(dmu_objset_ds(os), FTAG);
if (IS_ERR(s))
return (ERR_CAST(s));
+ if (err) {
+ deactivate_locked_super(s);
+ return (ERR_PTR(err));
+ }
+
if (s->s_root == NULL) {
err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
if (err) {
@@ -334,7 +375,7 @@ zpl_kill_sb(struct super_block *sb)
}
void
-zpl_prune_sb(int64_t nr_to_scan, void *arg)
+zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
struct super_block *sb = (struct super_block *)arg;
int objects = 0;
@@ -360,7 +401,11 @@ const struct super_operations zpl_super_operations = {
struct file_system_type zpl_fs_type = {
.owner = THIS_MODULE,
.name = ZFS_DRIVER,
+#if defined(HAVE_IDMAP_MNT_API)
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+#else
.fs_flags = FS_USERNS_MOUNT,
+#endif
.mount = zpl_mount,
.kill_sb = zpl_kill_sb,
};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index 98378109cb9a..4e4f5210f85d 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -246,8 +246,8 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
crhold(cr);
cookie = spl_fstrans_mark();
- ZPL_ENTER(zfsvfs);
- ZPL_VERIFY_ZP(zp);
+ if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ goto out1;
rw_enter(&zp->z_xattr_lock, RW_READER);
if (zfsvfs->z_use_sa && zp->z_is_sa) {
@@ -264,7 +264,8 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
out:
rw_exit(&zp->z_xattr_lock);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
+out1:
spl_fstrans_unmark(cookie);
crfree(cr);
@@ -435,12 +436,13 @@ zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size)
crhold(cr);
cookie = spl_fstrans_mark();
- ZPL_ENTER(zfsvfs);
- ZPL_VERIFY_ZP(zp);
+ if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ goto out;
rw_enter(&zp->z_xattr_lock, RW_READER);
error = __zpl_xattr_get(ip, name, value, size, cr);
rw_exit(&zp->z_xattr_lock);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
+out:
spl_fstrans_unmark(cookie);
crfree(cr);
@@ -497,7 +499,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
vap->va_gid = crgetgid(cr);
error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp,
- cr, 0, NULL);
+ cr, ATTR_NOACLCHECK, NULL, zfs_init_idmap);
if (error)
goto out;
}
@@ -511,7 +513,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
error = -zfs_write_simple(xzp, value, size, pos, NULL);
out:
if (error == 0) {
- ip->i_ctime = current_time(ip);
+ zpl_inode_set_ctime_to_ts(ip, current_time(ip));
zfs_mark_inode_dirty(ip);
}
@@ -604,8 +606,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value,
crhold(cr);
cookie = spl_fstrans_mark();
- ZPL_ENTER(zfsvfs);
- ZPL_VERIFY_ZP(zp);
+ if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+ goto out1;
rw_enter(&zp->z_xattr_lock, RW_WRITER);
/*
@@ -658,7 +660,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value,
zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr);
out:
rw_exit(&zp->z_xattr_lock);
- ZPL_EXIT(zfsvfs);
+ zpl_exit(zfsvfs, FTAG);
+out1:
spl_fstrans_unmark(cookie);
crfree(cr);
ASSERT3S(error, <=, 0);
@@ -735,9 +738,11 @@ __zpl_xattr_user_get(struct inode *ip, const char *name,
ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get);
static int
-__zpl_xattr_user_set(struct inode *ip, const char *name,
+__zpl_xattr_user_set(zidmap_t *user_ns,
+ struct inode *ip, const char *name,
const void *value, size_t size, int flags)
{
+ (void) user_ns;
int error = 0;
/* xattr_resolve_name will do this for us if this is defined */
#ifndef HAVE_XATTR_HANDLER_NAME
@@ -843,9 +848,11 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name,
ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
static int
-__zpl_xattr_trusted_set(struct inode *ip, const char *name,
+__zpl_xattr_trusted_set(zidmap_t *user_ns,
+ struct inode *ip, const char *name,
const void *value, size_t size, int flags)
{
+ (void) user_ns;
char *xattr_name;
int error;
@@ -911,9 +918,11 @@ __zpl_xattr_security_get(struct inode *ip, const char *name,
ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
static int
-__zpl_xattr_security_set(struct inode *ip, const char *name,
+__zpl_xattr_security_set(zidmap_t *user_ns,
+ struct inode *ip, const char *name,
const void *value, size_t size, int flags)
{
+ (void) user_ns;
char *xattr_name;
int error;
/* xattr_resolve_name will do this for us if this is defined */
@@ -937,7 +946,7 @@ zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs,
int error = 0;
for (xattr = xattrs; xattr->name != NULL; xattr++) {
- error = __zpl_xattr_security_set(ip,
+ error = __zpl_xattr_security_set(NULL, ip,
xattr->name, xattr->value, xattr->value_len, 0);
if (error < 0)
@@ -1002,7 +1011,8 @@ zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type)
*/
if (ip->i_mode != mode) {
ip->i_mode = ITOZ(ip)->z_mode = mode;
- ip->i_ctime = current_time(ip);
+ zpl_inode_set_ctime_to_ts(ip,
+ current_time(ip));
zfs_mark_inode_dirty(ip);
}
@@ -1052,11 +1062,23 @@ int
#ifdef HAVE_SET_ACL_USERNS
zpl_set_acl(struct user_namespace *userns, struct inode *ip,
struct posix_acl *acl, int type)
+#elif defined(HAVE_SET_ACL_IDMAP_DENTRY)
+zpl_set_acl(struct mnt_idmap *userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+#elif defined(HAVE_SET_ACL_USERNS_DENTRY_ARG2)
+zpl_set_acl(struct user_namespace *userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
#else
zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type)
#endif /* HAVE_SET_ACL_USERNS */
{
+#ifdef HAVE_SET_ACL_USERNS_DENTRY_ARG2
+ return (zpl_set_acl_impl(d_inode(dentry), acl, type));
+#elif defined(HAVE_SET_ACL_IDMAP_DENTRY)
+ return (zpl_set_acl_impl(d_inode(dentry), acl, type));
+#else
return (zpl_set_acl_impl(ip, acl, type));
+#endif /* HAVE_SET_ACL_USERNS_DENTRY_ARG2 */
}
#endif /* HAVE_SET_ACL */
@@ -1115,7 +1137,7 @@ zpl_get_acl_impl(struct inode *ip, int type)
return (acl);
}
-#if defined(HAVE_GET_ACL_RCU)
+#if defined(HAVE_GET_ACL_RCU) || defined(HAVE_GET_INODE_ACL)
struct posix_acl *
zpl_get_acl(struct inode *ip, int type, bool rcu)
{
@@ -1149,7 +1171,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir)
return (PTR_ERR(acl));
if (!acl) {
ITOZ(ip)->z_mode = (ip->i_mode &= ~current_umask());
- ip->i_ctime = current_time(ip);
+ zpl_inode_set_ctime_to_ts(ip, current_time(ip));
zfs_mark_inode_dirty(ip);
return (0);
}
@@ -1297,7 +1319,8 @@ __zpl_xattr_acl_get_default(struct inode *ip, const char *name,
ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default);
static int
-__zpl_xattr_acl_set_access(struct inode *ip, const char *name,
+__zpl_xattr_acl_set_access(zidmap_t *mnt_ns,
+ struct inode *ip, const char *name,
const void *value, size_t size, int flags)
{
struct posix_acl *acl;
@@ -1311,8 +1334,14 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name,
if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
return (-EOPNOTSUPP);
- if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
+#if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP)
+ if (!zpl_inode_owner_or_capable(mnt_ns, ip))
return (-EPERM);
+#else
+ (void) mnt_ns;
+ if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
+ return (-EPERM);
+#endif
if (value) {
acl = zpl_acl_from_xattr(value, size);
@@ -1336,7 +1365,8 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name,
ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access);
static int
-__zpl_xattr_acl_set_default(struct inode *ip, const char *name,
+__zpl_xattr_acl_set_default(zidmap_t *mnt_ns,
+ struct inode *ip, const char *name,
const void *value, size_t size, int flags)
{
struct posix_acl *acl;
@@ -1350,8 +1380,14 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name,
if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
return (-EOPNOTSUPP);
- if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
+#if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP)
+ if (!zpl_inode_owner_or_capable(mnt_ns, ip))
return (-EPERM);
+#else
+ (void) mnt_ns;
+ if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
+ return (-EPERM);
+#endif
if (value) {
acl = zpl_acl_from_xattr(value, size);
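Both ACL setters above now share one compatibility pattern: kernels built with HAVE_XATTR_SET_USERNS or HAVE_XATTR_SET_IDMAP pass the caller's mount namespace/idmap into the xattr handler, and the ownership check honors it; older kernels pass nothing usable, so the check falls back to zfs_init_idmap. A minimal sketch of that shared check, assuming the compat helpers behave exactly as used in the hunks above (the helper name is hypothetical):

/*
 * Hedged sketch, not driver code: the ownership check shared by
 * __zpl_xattr_acl_set_access() and __zpl_xattr_acl_set_default().
 */
static int
zpl_acl_owner_check_sketch(zidmap_t *mnt_ns, struct inode *ip)
{
#if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP)
	/* Newer kernels supply the caller's idmap; check against it. */
	if (!zpl_inode_owner_or_capable(mnt_ns, ip))
		return (-EPERM);
#else
	/* Older kernels do not; fall back to the initial idmap. */
	(void) mnt_ns;
	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EPERM);
#endif
	return (0);
}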
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index acbab55d03ef..4b960daf89ee 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -6,7 +6,7 @@
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
@@ -37,6 +37,7 @@
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
+#include <cityhash.h>
#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
@@ -53,8 +54,14 @@ static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
+/*
+ * Switch taskqs at multiples of 512 MB offset. Lower values utilize more
+ * threads for small I/O regions, but may reduce prefetch hits.
+ */
+#define ZVOL_TASKQ_OFFSET_SHIFT 29
+
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
-static const unsigned int zvol_open_timeout_ms = 1000;
+static unsigned int zvol_open_timeout_ms = 1000;
#endif
static unsigned int zvol_threads = 0;
@@ -76,6 +83,8 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
#endif
+static unsigned int zvol_num_taskqs = 0;
+
#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
@@ -114,7 +123,11 @@ struct zvol_state_os {
boolean_t use_blk_mq;
};
-taskq_t *zvol_taskq;
+typedef struct zv_taskq {
+ uint_t tqs_cnt;
+ taskq_t **tqs_taskq;
+} zv_taskq_t;
+static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;
typedef struct zv_request_stack {
@@ -342,8 +355,7 @@ zvol_discard(zv_request_t *zvr)
struct request_queue *q = zv->zv_zso->zvo_queue;
struct gendisk *disk = zv->zv_zso->zvo_disk;
unsigned long start_time = 0;
-
- boolean_t acct = blk_queue_io_stat(q);
+ boolean_t acct = B_FALSE;
ASSERT3P(zv, !=, NULL);
ASSERT3U(zv->zv_open_count, >, 0);
@@ -388,7 +400,7 @@ zvol_discard(zv_request_t *zvr)
if (error != 0) {
dmu_tx_abort(tx);
} else {
- zvol_log_truncate(zv, tx, start, size, B_TRUE);
+ zvol_log_truncate(zv, tx, start, size);
dmu_tx_commit(tx);
error = dmu_free_long_range(zv->zv_objset,
ZVOL_OBJ, start, size);
@@ -513,7 +525,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
uint64_t size = io_size(bio, rq);
int rw = io_data_dir(bio, rq);
- if (zvol_request_sync)
+ if (zvol_request_sync || zv->zv_threading == B_FALSE)
force_sync = 1;
zv_request_t zvr = {
@@ -533,6 +545,22 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
}
zv_request_task_t *task;
+ zv_taskq_t *ztqs = &zvol_taskqs;
+ uint_t blk_mq_hw_queue = 0;
+ uint_t tq_idx;
+ uint_t taskq_hash;
+#ifdef HAVE_BLK_MQ
+ if (rq)
+#ifdef HAVE_BLK_MQ_RQ_HCTX
+ blk_mq_hw_queue = rq->mq_hctx->queue_num;
+#else
+ blk_mq_hw_queue =
+ rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
+#endif
+#endif
+ taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
+ blk_mq_hw_queue, 0);
+ tq_idx = taskq_hash % ztqs->tqs_cnt;
if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
@@ -558,7 +586,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
rw_enter(&zv->zv_suspend_lock, RW_WRITER);
if (zv->zv_zilog == NULL) {
zv->zv_zilog = zil_open(zv->zv_objset,
- zvol_get_data);
+ zvol_get_data, &zv->zv_kstat.dk_zil_sums);
zv->zv_flags |= ZVOL_WRITTEN_TO;
/* replay / destroy done in zvol_create_minor */
VERIFY0((zv->zv_zilog->zl_header->zh_flags &
@@ -602,7 +630,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
zvol_discard(&zvr);
} else {
task = zv_request_task_create(zvr);
- taskq_dispatch_ent(zvol_taskq,
+ taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_discard_task, task, 0, &task->ent);
}
} else {
@@ -610,7 +638,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
zvol_write(&zvr);
} else {
task = zv_request_task_create(zvr);
- taskq_dispatch_ent(zvol_taskq,
+ taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_write_task, task, 0, &task->ent);
}
}
@@ -632,7 +660,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
zvol_read(&zvr);
} else {
task = zv_request_task_create(zvr);
- taskq_dispatch_ent(zvol_taskq,
+ taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
zvol_read_task, task, 0, &task->ent);
}
}
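
Taken together, the dispatch hunks above replace the single global zvol_taskq with per-request selection: cityhash4() (from the newly included <cityhash.h>) mixes the zvol pointer, the I/O offset coarsened to 512 MB stripes (1 << ZVOL_TASKQ_OFFSET_SHIFT bytes), and the blk-mq hardware queue number, and the hash indexes into the taskq array. A condensed sketch of just the selection step, reusing the types defined earlier in this diff (the helper name is hypothetical):

/*
 * Hedged sketch of the taskq selection in zvol_request_impl() above.
 * Requests for the same zvol, 512 MB stripe, and hw queue always land
 * on the same taskq, preserving prefetch locality.
 */
static taskq_t *
zvol_select_taskq_sketch(zv_taskq_t *ztqs, zvol_state_t *zv,
    uint64_t offset, uint_t blk_mq_hw_queue)
{
	uint_t taskq_hash = cityhash4((uintptr_t)zv,
	    offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue, 0);
	return (ztqs->tqs_taskq[taskq_hash % ztqs->tqs_cnt]);
}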
@@ -672,7 +700,11 @@ zvol_request(struct request_queue *q, struct bio *bio)
}
static int
+#ifdef HAVE_BLK_MODE_T
+zvol_open(struct gendisk *disk, blk_mode_t flag)
+#else
zvol_open(struct block_device *bdev, fmode_t flag)
+#endif
{
zvol_state_t *zv;
int error = 0;
@@ -687,10 +719,14 @@ retry:
/*
* Obtain a copy of private_data under the zvol_state_lock to make
* sure that either the result of zvol free code path setting
- * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free()
+ * disk->private_data to NULL is observed, or zvol_os_free()
* is not called on this zv because of the positive zv_open_count.
*/
+#ifdef HAVE_BLK_MODE_T
+ zv = disk->private_data;
+#else
zv = bdev->bd_disk->private_data;
+#endif
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
@@ -770,14 +806,15 @@ retry:
}
}
- error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+ error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
}
if (error == 0) {
- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ if ((blk_mode_is_open_write(flag)) &&
+ (zv->zv_flags & ZVOL_RDONLY)) {
if (zv->zv_open_count == 0)
zvol_last_close(zv);
@@ -792,14 +829,25 @@ retry:
rw_exit(&zv->zv_suspend_lock);
if (error == 0)
+#ifdef HAVE_BLK_MODE_T
+ disk_check_media_change(disk);
+#else
zfs_check_media_change(bdev);
+#endif
return (error);
}
static void
-zvol_release(struct gendisk *disk, fmode_t mode)
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
+zvol_release(struct gendisk *disk)
+#else
+zvol_release(struct gendisk *disk, fmode_t unused)
+#endif
{
+#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
+ (void) unused;
+#endif
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
@@ -854,7 +902,13 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
switch (cmd) {
case BLKFLSBUF:
+#ifdef HAVE_FSYNC_BDEV
fsync_bdev(bdev);
+#elif defined(HAVE_SYNC_BLOCKDEV)
+ sync_blockdev(bdev);
+#else
+#error "Neither fsync_bdev() nor sync_blockdev() found"
+#endif
invalidate_bdev(bdev);
rw_enter(&zv->zv_suspend_lock, RW_READER);
@@ -1030,6 +1084,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
zso->zvo_disk->minors = ZVOL_MINORS;
zso->zvo_queue = zso->zvo_disk->queue;
+#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
+ struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+ if (IS_ERR(disk)) {
+ zso->zvo_disk = NULL;
+ return (1);
+ }
+
+ zso->zvo_disk = disk;
+ zso->zvo_disk->minors = ZVOL_MINORS;
+ zso->zvo_queue = zso->zvo_disk->queue;
#else
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
if (zso->zvo_queue == NULL)
@@ -1078,6 +1142,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
}
zso->zvo_queue = zso->zvo_disk->queue;
zso->zvo_disk->minors = ZVOL_MINORS;
+#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
+ struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
+ if (IS_ERR(disk)) {
+ zso->zvo_disk = NULL;
+ blk_mq_free_tag_set(&zso->tag_set);
+ return (1);
+ }
+
+ zso->zvo_disk = disk;
+ zso->zvo_queue = zso->zvo_disk->queue;
+ zso->zvo_disk->minors = ZVOL_MINORS;
#else
zso->zvo_disk = alloc_disk(ZVOL_MINORS);
if (zso->zvo_disk == NULL) {
@@ -1174,7 +1249,7 @@ zvol_alloc(dev_t dev, const char *name)
zso->zvo_queue->queuedata = zv;
zso->zvo_dev = dev;
zv->zv_open_count = 0;
- strlcpy(zv->zv_name, name, MAXNAMELEN);
+ strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
@@ -1231,9 +1306,13 @@ zvol_os_free(zvol_state_t *zv)
del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
- defined(HAVE_BLK_ALLOC_DISK)
+ (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
+#if defined(HAVE_BLK_CLEANUP_DISK)
blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
+ put_disk(zv->zv_zso->zvo_disk);
+#endif
+#else
blk_cleanup_queue(zv->zv_zso->zvo_queue);
put_disk(zv->zv_zso->zvo_disk);
#endif
@@ -1275,6 +1354,8 @@ zvol_os_create_minor(const char *name)
int error = 0;
int idx;
uint64_t hash = zvol_name_hash(name);
+ uint64_t volthreading;
+ bool replayed_zil = B_FALSE;
if (zvol_inhibit_dev)
return (0);
@@ -1283,6 +1364,13 @@ zvol_os_create_minor(const char *name)
if (idx < 0)
return (SET_ERROR(-idx));
minor = idx << ZVOL_MINOR_BITS;
+ if (MINOR(minor) != minor) {
+ /* too many partitions can overflow the minor number space */
+ zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
+ name, minor, MINOR(minor));
+ ida_simple_remove(&zvol_ida, idx);
+ return (SET_ERROR(EINVAL));
+ }
zv = zvol_find_by_name_hash(name, hash, RW_NONE);
if (zv) {
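
The new guard works because Linux packs minor numbers into the low MINORBITS (20) bits of a dev_t, so any value that does not survive a round trip through MINOR() has overflowed the minor space. A worked example, assuming ZVOL_MINOR_BITS == 4, i.e. sixteen partition slots per zvol (the helper is hypothetical):

#include <linux/kdev_t.h>

/*
 * Hedged sketch of the overflow check above.  With idx == 65536,
 * minor == 1 << 20, and MINOR(minor) == 0 != minor, so
 * zvol_os_create_minor() would return EINVAL.
 */
static int
zvol_minor_fits_sketch(int idx)
{
	unsigned int minor = idx << 4;	/* idx << ZVOL_MINOR_BITS */
	return (MINOR(minor) == minor);
}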
@@ -1320,6 +1408,12 @@ zvol_os_create_minor(const char *name)
zv->zv_volsize = volsize;
zv->zv_objset = os;
+ /* Default */
+ zv->zv_threading = B_TRUE;
+ if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
+ == 0)
+ zv->zv_threading = volthreading;
+
set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
@@ -1408,18 +1502,21 @@ zvol_os_create_minor(const char *name)
blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif
+ ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
+ if (error)
+ goto out_dmu_objset_disown;
ASSERT3P(zv->zv_zilog, ==, NULL);
- zv->zv_zilog = zil_open(os, zvol_get_data);
+ zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
if (spa_writeable(dmu_objset_spa(os))) {
if (zil_replay_disable)
- zil_destroy(zv->zv_zilog, B_FALSE);
+ replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
else
- zil_replay(os, zv, zvol_replay_vector);
+ replayed_zil = zil_replay(os, zv, zvol_replay_vector);
}
- zil_close(zv->zv_zilog);
+ if (replayed_zil)
+ zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
- ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
- dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
/*
* When udev detects the addition of the device it will immediately
@@ -1427,7 +1524,7 @@ zvol_os_create_minor(const char *name)
* Prefetching the blocks commonly scanned by blkid(8) will speed
* up this process.
*/
- len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+ len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
if (len > 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
@@ -1488,6 +1585,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
*/
set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
set_disk_ro(zv->zv_zso->zvo_disk, readonly);
+
+ dataset_kstats_rename(&zv->zv_kstat, newname);
}
void
@@ -1528,8 +1627,40 @@ zvol_init(void)
zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
}
+ /*
+ * Use at least 32 zvol_threads, but on many-core systems prefer about
+ * 6 threads per taskq, with no more taskqs than there are threads in
+ * each of them.
+ *
+ * taskq total
+ * cpus taskqs threads threads
+ * ------- ------- ------- -------
+ * 1 1 32 32
+ * 2 1 32 32
+ * 4 1 32 32
+ * 8 2 16 32
+ * 16 3 11 33
+ * 32 5 7 35
+ * 64 8 8 64
+ * 128 11 12 132
+ * 256 16 16 256
+ */
+ zv_taskq_t *ztqs = &zvol_taskqs;
+ uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
+ if (num_tqs == 0) {
+ num_tqs = 1 + num_online_cpus() / 6;
+ while (num_tqs * num_tqs > zvol_actual_threads)
+ num_tqs--;
+ }
+ uint_t per_tq_thread = zvol_actual_threads / num_tqs;
+ if (per_tq_thread * num_tqs < zvol_actual_threads)
+ per_tq_thread++;
+ ztqs->tqs_cnt = num_tqs;
+ ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
error = register_blkdev(zvol_major, ZVOL_DRIVER);
if (error) {
+ kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
+ ztqs->tqs_taskq = NULL;
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
return (error);
}
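
The sizing rule is easiest to see with numbers: on a 16-CPU machine with the default 32 threads, num_tqs = 1 + 16/6 = 3 (the tqs^2 <= threads cap does not bite, since 9 <= 32), and per_tq_thread rounds 32/3 up to 11, reproducing the "16 cpus, 3 taskqs, 11 threads, 33 total" row of the table above. A sketch of the same arithmetic, using only the defaults shown in this hunk (the function is illustrative, not driver code):

/* Hedged sketch of the taskq sizing above, with sample inputs. */
static void
zvol_taskq_sizing_example(void)
{
	uint_t zvol_actual_threads = 32;	/* default minimum */
	uint_t cpus = 16;			/* sample machine */

	uint_t num_tqs = 1 + cpus / 6;		/* 3 */
	while (num_tqs * num_tqs > zvol_actual_threads)
		num_tqs--;			/* keep tqs^2 <= threads */

	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
	if (per_tq_thread * num_tqs < zvol_actual_threads)
		per_tq_thread++;		/* round up: 11 */

	/* 3 taskqs x 11 threads = 33 total threads. */
	(void) per_tq_thread;
}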
@@ -1549,11 +1680,22 @@ zvol_init(void)
1024);
}
#endif
- zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
- zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
- if (zvol_taskq == NULL) {
- unregister_blkdev(zvol_major, ZVOL_DRIVER);
- return (-ENOMEM);
+ for (uint_t i = 0; i < num_tqs; i++) {
+ char name[32];
+ (void) snprintf(name, sizeof (name), "%s_tq-%u",
+ ZVOL_DRIVER, i);
+ ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
+ maxclsyspri, per_tq_thread, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ if (ztqs->tqs_taskq[i] == NULL) {
+ for (int j = i - 1; j >= 0; j--)
+ taskq_destroy(ztqs->tqs_taskq[j]);
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+ kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
+ sizeof (taskq_t *));
+ ztqs->tqs_taskq = NULL;
+ return (-ENOMEM);
+ }
}
zvol_init_impl();
@@ -1564,9 +1706,22 @@ zvol_init(void)
void
zvol_fini(void)
{
+ zv_taskq_t *ztqs = &zvol_taskqs;
zvol_fini_impl();
unregister_blkdev(zvol_major, ZVOL_DRIVER);
- taskq_destroy(zvol_taskq);
+
+ if (ztqs->tqs_taskq == NULL) {
+ ASSERT3U(ztqs->tqs_cnt, ==, 0);
+ } else {
+ for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
+ ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
+ taskq_destroy(ztqs->tqs_taskq[i]);
+ }
+ kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
+ sizeof (taskq_t *));
+ ztqs->tqs_taskq = NULL;
+ }
+
ida_destroy(&zvol_ida);
}
@@ -1587,6 +1742,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+module_param(zvol_num_taskqs, uint, 0444);
+MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
+
module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
@@ -1605,4 +1763,9 @@ MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
"Process volblocksize blocks per thread");
#endif
+#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+module_param(zvol_open_timeout_ms, uint, 0644);
+MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
+#endif
+
/* END CSTYLED */