diff options
Diffstat (limited to 'sys/contrib/openzfs/module/os/linux')
48 files changed, 3667 insertions, 1394 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c index d0461a9f1298..5898789ad53d 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c @@ -20,7 +20,7 @@ * You should have received a copy of the GNU General Public License along * with the SPL. If not, see <http://www.gnu.org/licenses/>. * - * Solaris Porting Layer (SPL) Credential Implementation. + * Solaris Porting Layer (SPL) Condition Variables Implementation. */ #include <sys/condvar.h> @@ -37,7 +37,7 @@ #endif #define MAX_HRTIMEOUT_SLACK_US 1000 -unsigned int spl_schedule_hrtimeout_slack_us = 0; +static unsigned int spl_schedule_hrtimeout_slack_us = 0; static int param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c index f81b9540a639..d407fc66b2de 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c @@ -145,6 +145,18 @@ crgetgid(const cred_t *cr) return (KGID_TO_SGID(cr->fsgid)); } +/* Return the initial user ns or nop_mnt_idmap */ +zidmap_t * +zfs_get_init_idmap(void) +{ +#ifdef HAVE_IOPS_CREATE_IDMAP + return ((zidmap_t *)&nop_mnt_idmap); +#else + return ((zidmap_t *)&init_user_ns); +#endif +} + +EXPORT_SYMBOL(zfs_get_init_idmap); EXPORT_SYMBOL(crhold); EXPORT_SYMBOL(crfree); EXPORT_SYMBOL(crgetuid); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c index c84c39b56bf7..29781b9515b2 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c @@ -32,7 +32,7 @@ * analysis and other such goodies. * But we would still default to the current default of not to do that. */ -unsigned int spl_panic_halt; +static unsigned int spl_panic_halt; /* CSTYLED */ module_param(spl_panic_halt, uint, 0644); MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); @@ -45,7 +45,7 @@ spl_dumpstack(void) } EXPORT_SYMBOL(spl_dumpstack); -int +void spl_panic(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; @@ -75,7 +75,6 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...) schedule(); /* Unreachable */ - return (1); } EXPORT_SYMBOL(spl_panic); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index 5179100d1665..986db1518456 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -23,6 +23,7 @@ * Solaris Porting Layer (SPL) Generic Implementation. */ +#include <sys/isa_defs.h> #include <sys/sysmacros.h> #include <sys/systeminfo.h> #include <sys/vmsystm.h> @@ -47,6 +48,8 @@ #include <linux/mod_compat.h> #include <sys/cred.h> #include <sys/vnode.h> +#include <sys/misc.h> +#include <linux/mod_compat.h> unsigned long spl_hostid = 0; EXPORT_SYMBOL(spl_hostid); @@ -59,10 +62,10 @@ proc_t p0; EXPORT_SYMBOL(p0); /* - * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna + * xoshiro256++ 1.0 PRNG by David Blackman and Sebastiano Vigna * - * "Further scramblings of Marsaglia's xorshift generators" - * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * "Scrambled Linear Pseudorandom Number Generators∗" + * https://vigna.di.unimi.it/ftp/papers/ScrambledLinear.pdf * * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose * is to provide bytes containing random numbers. It is mapped to /dev/urandom @@ -74,66 +77,85 @@ EXPORT_SYMBOL(p0); * free of atomic instructions. * * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() - * to generate words larger than 128 bits will paradoxically be limited to - * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` - * 128-bit words and selecting the first will implicitly select the second. If + * to generate words larger than 256 bits will paradoxically be limited to + * `2^256 - 1` possibilities. This is because we have a sequence of `2^256 - 1` + * 256-bit words and selecting the first will implicitly select the second. If * a caller finds this behavior undesirable, random_get_bytes() should be used * instead. * * XXX: Linux interrupt handlers that trigger within the critical section - * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will + * formed by `s[3] = xp[3];` and `xp[0] = s[0];` and call this function will * see the same numbers. Nothing in the code currently calls this in an * interrupt handler, so this is considered to be okay. If that becomes a * problem, we could create a set of per-cpu variables for interrupt handlers * and use them when in_interrupt() from linux/preempt_mask.h evaluates to * true. */ -void __percpu *spl_pseudo_entropy; +static void __percpu *spl_pseudo_entropy; /* - * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed - * file: + * rotl()/spl_rand_next()/spl_rand_jump() are copied from the following CC-0 + * licensed file: * - * http://xorshift.di.unimi.it/xorshift128plus.c + * https://prng.di.unimi.it/xoshiro256plusplus.c */ +static inline uint64_t rotl(const uint64_t x, int k) +{ + return ((x << k) | (x >> (64 - k))); +} + static inline uint64_t spl_rand_next(uint64_t *s) { - uint64_t s1 = s[0]; - const uint64_t s0 = s[1]; - s[0] = s0; - s1 ^= s1 << 23; // a - s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c - return (s[1] + s0); + const uint64_t result = rotl(s[0] + s[3], 23) + s[0]; + + const uint64_t t = s[1] << 17; + + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + + s[2] ^= t; + + s[3] = rotl(s[3], 45); + + return (result); } static inline void spl_rand_jump(uint64_t *s) { - static const uint64_t JUMP[] = - { 0x8a5cd789635d2dff, 0x121fd2155c472f96 }; + static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, + 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; uint64_t s0 = 0; uint64_t s1 = 0; + uint64_t s2 = 0; + uint64_t s3 = 0; int i, b; for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) for (b = 0; b < 64; b++) { if (JUMP[i] & 1ULL << b) { s0 ^= s[0]; s1 ^= s[1]; + s2 ^= s[2]; + s3 ^= s[3]; } (void) spl_rand_next(s); } s[0] = s0; s[1] = s1; + s[2] = s2; + s[3] = s3; } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { - uint64_t *xp, s[2]; + uint64_t *xp, s[4]; ASSERT(ptr); @@ -141,6 +163,8 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) s[0] = xp[0]; s[1] = xp[1]; + s[2] = xp[2]; + s[3] = xp[3]; while (len) { union { @@ -152,12 +176,22 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) len -= i; entropy.ui64 = spl_rand_next(s); + /* + * xoshiro256++ has low entropy lower bytes, so we copy the + * higher order bytes first. + */ while (i--) +#ifdef _ZFS_BIG_ENDIAN *ptr++ = entropy.byte[i]; +#else + *ptr++ = entropy.byte[7 - i]; +#endif } xp[0] = s[0]; xp[1] = s[1]; + xp[2] = s[2]; + xp[3] = s[3]; put_cpu_ptr(spl_pseudo_entropy); @@ -220,8 +254,10 @@ __div_u64(uint64_t u, uint32_t v) * replacements for libgcc-provided functions and will never be called * directly. */ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmissing-prototypes" +#endif /* * Implementation of 64-bit unsigned division for 32-bit machines. @@ -415,7 +451,9 @@ __aeabi_ldivmod(int64_t u, int64_t v) EXPORT_SYMBOL(__aeabi_ldivmod); #endif /* __arm || __arm__ */ +#if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop +#endif #endif /* BITS_PER_LONG */ @@ -458,7 +496,7 @@ int ddi_strto##type(const char *str, char **endptr, \ if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \ base = 16; /* hex */ \ ptr += 2; \ - } else if (str[1] >= '0' && str[1] < 8) { \ + } else if (str[1] >= '0' && str[1] < '8') { \ base = 8; /* octal */ \ ptr += 1; \ } else { \ @@ -517,6 +555,61 @@ ddi_copyin(const void *from, void *to, size_t len, int flags) } EXPORT_SYMBOL(ddi_copyin); +#define define_spl_param(type, fmt) \ +int \ +spl_param_get_##type(char *buf, zfs_kernel_param_t *kp) \ +{ \ + return (scnprintf(buf, PAGE_SIZE, fmt "\n", \ + *(type *)kp->arg)); \ +} \ +int \ +spl_param_set_##type(const char *buf, zfs_kernel_param_t *kp) \ +{ \ + return (kstrto##type(buf, 0, (type *)kp->arg)); \ +} \ +const struct kernel_param_ops spl_param_ops_##type = { \ + .set = spl_param_set_##type, \ + .get = spl_param_get_##type, \ +}; \ +EXPORT_SYMBOL(spl_param_get_##type); \ +EXPORT_SYMBOL(spl_param_set_##type); \ +EXPORT_SYMBOL(spl_param_ops_##type); + +define_spl_param(s64, "%lld") +define_spl_param(u64, "%llu") + +/* + * Post a uevent to userspace whenever a new vdev adds to the pool. It is + * necessary to sync blkid information with udev, which zed daemon uses + * during device hotplug to identify the vdev. + */ +void +spl_signal_kobj_evt(struct block_device *bdev) +{ +#if defined(HAVE_BDEV_KOBJ) || defined(HAVE_PART_TO_DEV) +#ifdef HAVE_BDEV_KOBJ + struct kobject *disk_kobj = bdev_kobj(bdev); +#else + struct kobject *disk_kobj = &part_to_dev(bdev->bd_part)->kobj; +#endif + if (disk_kobj) { + int ret = kobject_uevent(disk_kobj, KOBJ_CHANGE); + if (ret) { + pr_warn("ZFS: Sending event '%d' to kobject: '%s'" + " (%p): failed(ret:%d)\n", KOBJ_CHANGE, + kobject_name(disk_kobj), disk_kobj, ret); + } + } +#else +/* + * This is encountered if neither bdev_kobj() nor part_to_dev() is available + * in the kernel - likely due to an API change that needs to be chased down. + */ +#error "Unsupported kernel: unable to get struct kobj from bdev" +#endif +} +EXPORT_SYMBOL(spl_signal_kobj_evt); + int ddi_copyout(const void *from, void *to, size_t len, int flags) { @@ -705,28 +798,33 @@ spl_kvmem_init(void) * initialize each of the per-cpu seeds so that the sequences generated on each * CPU are guaranteed to never overlap in practice. */ -static void __init +static int __init spl_random_init(void) { - uint64_t s[2]; + uint64_t s[4]; int i = 0; - spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t), + spl_pseudo_entropy = __alloc_percpu(4 * sizeof (uint64_t), sizeof (uint64_t)); + if (!spl_pseudo_entropy) + return (-ENOMEM); + get_random_bytes(s, sizeof (s)); - if (s[0] == 0 && s[1] == 0) { + if (s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0) { if (jiffies != 0) { s[0] = jiffies; s[1] = ~0 - jiffies; + s[2] = ~jiffies; + s[3] = jiffies - ~0; } else { - (void) memcpy(s, "improbable seed", sizeof (s)); + (void) memcpy(s, "improbable seed", 16); } printk("SPL: get_random_bytes() returned 0 " "when generating random seed. Setting initial seed to " - "0x%016llx%016llx.\n", cpu_to_be64(s[0]), - cpu_to_be64(s[1])); + "0x%016llx%016llx%016llx%016llx.\n", cpu_to_be64(s[0]), + cpu_to_be64(s[1]), cpu_to_be64(s[2]), cpu_to_be64(s[3])); } for_each_possible_cpu(i) { @@ -736,7 +834,11 @@ spl_random_init(void) wordp[0] = s[0]; wordp[1] = s[1]; + wordp[2] = s[2]; + wordp[3] = s[3]; } + + return (0); } static void @@ -757,7 +859,8 @@ spl_init(void) { int rc = 0; - spl_random_init(); + if ((rc = spl_random_init())) + goto out0; if ((rc = spl_kvmem_init())) goto out1; @@ -800,6 +903,8 @@ out3: out2: spl_kvmem_fini(); out1: + spl_random_fini(); +out0: return (rc); } diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c index ba4ca49a2ac9..42821ad60256 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -28,6 +28,7 @@ #include <sys/timer.h> #include <sys/vmem.h> #include <sys/wait.h> +#include <sys/string.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/prefetch.h> @@ -76,17 +77,6 @@ module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); -/* - * The default behavior is to report the number of objects remaining in the - * cache. This allows the Linux VM to repeatedly reclaim objects from the - * cache when memory is low satisfy other memory allocations. Alternately, - * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache - * is reclaimed. This may increase the likelihood of out of memory events. - */ -static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; -module_param(spl_kmem_cache_reclaim, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); - static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); @@ -102,7 +92,8 @@ MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); * of 16K was determined to be optimal for architectures using 4K pages and * to also work well on architecutres using larger 64K page sizes. */ -static unsigned int spl_kmem_cache_slab_limit = 16384; +static unsigned int spl_kmem_cache_slab_limit = + SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE; module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); @@ -151,7 +142,7 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ -taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ +static taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); @@ -182,8 +173,11 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size) * of that infrastructure we are responsible for incrementing it. */ if (current->reclaim_state) +#ifdef HAVE_RECLAIM_STATE_RECLAIMED + current->reclaim_state->reclaimed += size >> PAGE_SHIFT; +#else current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; - +#endif vfree(ptr); } @@ -701,12 +695,12 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); + skc->skc_name = kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { kfree(skc); return (NULL); } - strncpy(skc->skc_name, name, skc->skc_name_size); + strlcpy(skc->skc_name, name, skc->skc_name_size); skc->skc_ctor = ctor; skc->skc_dtor = dtor; @@ -791,10 +785,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, } else { unsigned long slabflags = 0; - if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { - rc = EINVAL; + if (size > spl_kmem_cache_slab_limit) goto out; - } #if defined(SLAB_USERCOPY) /* @@ -815,10 +807,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, skc->skc_linux_cache = kmem_cache_create( skc->skc_name, size, align, slabflags, NULL); #endif - if (skc->skc_linux_cache == NULL) { - rc = ENOMEM; + if (skc->skc_linux_cache == NULL) goto out; - } } down_write(&spl_kmem_cache_sem); @@ -1016,10 +1006,20 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); - might_sleep(); + *obj = NULL; /* + * Since we can't sleep attempt an emergency allocation to satisfy + * the request. The only alterative is to fail the allocation but + * it's preferable try. The use of KM_NOSLEEP is expected to be rare. + */ + if (flags & KM_NOSLEEP) + return (spl_emergency_alloc(skc, flags, obj)); + + might_sleep(); + + /* * Before allocating a new slab wait for any reaping to complete and * then return so the local magazine can be rechecked for new objects. */ @@ -1452,6 +1452,9 @@ spl_kmem_cache_init(void) spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (spl_kmem_cache_taskq == NULL) + return (-ENOMEM); + return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c index c6d3c8f4413f..ad553a73a69e 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c @@ -32,6 +32,7 @@ #include <sys/vmem.h> #include <sys/cmn_err.h> #include <sys/sysmacros.h> +#include <sys/string.h> static kmutex_t kstat_module_lock; static struct list_head kstat_module_list; @@ -390,7 +391,7 @@ kstat_create_module(char *name) module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); module->ksm_proc = pde; - strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); + strlcpy(module->ksm_name, name, KSTAT_STRLEN); INIT_LIST_HEAD(&module->ksm_kstat_list); list_add_tail(&module->ksm_module_list, &kstat_module_list); @@ -479,8 +480,8 @@ kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, kpep->kpe_owner = NULL; kpep->kpe_proc = NULL; INIT_LIST_HEAD(&kpep->kpe_list); - strncpy(kpep->kpe_module, module, KSTAT_STRLEN); - strncpy(kpep->kpe_name, name, KSTAT_STRLEN); + strlcpy(kpep->kpe_module, module, sizeof (kpep->kpe_module)); + strlcpy(kpep->kpe_name, name, sizeof (kpep->kpe_name)); } EXPORT_SYMBOL(kstat_proc_entry_init); @@ -514,7 +515,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; ksp->ks_instance = ks_instance; - strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); + strlcpy(ksp->ks_class, ks_class, sizeof (ksp->ks_class)); ksp->ks_type = ks_type; ksp->ks_flags = ks_flags; ksp->ks_update = kstat_default_update; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c index 01f5619e1893..f0f929d3ce90 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c @@ -47,6 +47,10 @@ static unsigned long table_min = 0; static unsigned long table_max = ~0; static struct ctl_table_header *spl_header = NULL; +#ifndef HAVE_REGISTER_SYSCTL_TABLE +static struct ctl_table_header *spl_kmem = NULL; +static struct ctl_table_header *spl_kstat = NULL; +#endif static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL; @@ -624,6 +628,7 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, +#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -634,9 +639,11 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -648,21 +655,64 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif + +static void spl_proc_cleanup(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + +#ifndef HAVE_REGISTER_SYSCTL_TABLE + if (spl_kstat) { + unregister_sysctl_table(spl_kstat); + spl_kstat = NULL; + } + if (spl_kmem) { + unregister_sysctl_table(spl_kmem); + spl_kmem = NULL; + } +#endif + if (spl_header) { + unregister_sysctl_table(spl_header); + spl_header = NULL; + } +} int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table); + if (spl_kmem == NULL) { + rc = -EUNATCH; + goto out; + } + spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table); + if (spl_kstat == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { @@ -703,15 +753,8 @@ spl_proc_init(void) goto out; } out: - if (rc) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - unregister_sysctl_table(spl_header); - } + if (rc) + spl_proc_cleanup(); return (rc); } @@ -719,13 +762,5 @@ out: void spl_proc_fini(void) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - - ASSERT(spl_header != NULL); - unregister_sysctl_table(spl_header); + spl_proc_cleanup(); } diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c index 81501460f04f..5e073950d61a 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,9 +23,9 @@ */ #include <sys/list.h> -#include <sys/mutex.h> #include <sys/procfs_list.h> #include <linux/proc_fs.h> +#include <sys/mutex.h> /* * A procfs_list is a wrapper around a linked list which implements the seq_file diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-shrinker.c b/sys/contrib/openzfs/module/os/linux/spl/spl-shrinker.c new file mode 100644 index 000000000000..d5c8da471cbb --- /dev/null +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-shrinker.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <behlendorf1@llnl.gov>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Shrinker Implementation. + */ + +#include <sys/kmem.h> +#include <sys/shrinker.h> + +#ifdef HAVE_SINGLE_SHRINKER_CALLBACK +/* 3.0-3.11: single shrink() callback, which we wrap to carry both functions */ +struct spl_shrinker_wrap { + struct shrinker shrinker; + spl_shrinker_cb countfunc; + spl_shrinker_cb scanfunc; +}; + +static int +spl_shrinker_single_cb(struct shrinker *shrinker, struct shrink_control *sc) +{ + struct spl_shrinker_wrap *sw = (struct spl_shrinker_wrap *)shrinker; + + if (sc->nr_to_scan != 0) + (void) sw->scanfunc(&sw->shrinker, sc); + return (sw->countfunc(&sw->shrinker, sc)); +} +#endif + +struct shrinker * +spl_register_shrinker(const char *name, spl_shrinker_cb countfunc, + spl_shrinker_cb scanfunc, int seek_cost) +{ + struct shrinker *shrinker; + + /* allocate shrinker */ +#if defined(HAVE_SHRINKER_REGISTER) + /* 6.7: kernel will allocate the shrinker for us */ + shrinker = shrinker_alloc(0, name); +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + /* 3.12-6.6: we allocate the shrinker */ + shrinker = kmem_zalloc(sizeof (struct shrinker), KM_SLEEP); +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) + /* 3.0-3.11: allocate a wrapper */ + struct spl_shrinker_wrap *sw = + kmem_zalloc(sizeof (struct spl_shrinker_wrap), KM_SLEEP); + shrinker = &sw->shrinker; +#else + /* 2.x-2.6.22, or a newer shrinker API has been introduced. */ +#error "Unknown shrinker API" +#endif + + if (shrinker == NULL) + return (NULL); + + /* set callbacks */ +#ifdef HAVE_SINGLE_SHRINKER_CALLBACK + sw->countfunc = countfunc; + sw->scanfunc = scanfunc; + shrinker->shrink = spl_shrinker_single_cb; +#else + shrinker->count_objects = countfunc; + shrinker->scan_objects = scanfunc; +#endif + + /* set params */ + shrinker->seeks = seek_cost; + + /* register with kernel */ +#if defined(HAVE_SHRINKER_REGISTER) + shrinker_register(shrinker); +#elif defined(HAVE_REGISTER_SHRINKER_VARARG) + register_shrinker(shrinker, name); +#else + register_shrinker(shrinker); +#endif + + return (shrinker); +} +EXPORT_SYMBOL(spl_register_shrinker); + +void +spl_unregister_shrinker(struct shrinker *shrinker) +{ +#if defined(HAVE_SHRINKER_REGISTER) + shrinker_free(shrinker); +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + unregister_shrinker(shrinker); + kmem_free(shrinker, sizeof (struct shrinker)); +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) + unregister_shrinker(shrinker); + kmem_free(shrinker, sizeof (struct spl_shrinker_wrap)); +#else +#error "Unknown shrinker API" +#endif +} +EXPORT_SYMBOL(spl_unregister_shrinker); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index 0aab148975aa..c384b7b378c3 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); +static uint_t spl_taskq_thread_timeout_ms = 5000; +/* BEGIN CSTYLED */ +module_param(spl_taskq_thread_timeout_ms, uint, 0644); +/* END CSTYLED */ +MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, + "Minimum idle threads exit interval for dynamic taskqs"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -46,8 +52,10 @@ module_param(spl_taskq_thread_priority, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_priority, "Allow non-default priority for taskq threads"); -static int spl_taskq_thread_sequential = 4; -module_param(spl_taskq_thread_sequential, int, 0644); +static uint_t spl_taskq_thread_sequential = 4; +/* BEGIN CSTYLED */ +module_param(spl_taskq_thread_sequential, uint, 0644); +/* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_sequential, "Create new taskq threads after N sequential tasks"); @@ -586,8 +594,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) ASSERT(tq->tq_nactive <= tq->tq_nthreads); if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) + if (taskq_thread_spawn(tq) == 0) goto out; } @@ -621,11 +628,11 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. */ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); - +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -668,10 +675,11 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -696,9 +704,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) - goto out2; + if (taskq_thread_spawn(tq) == 0) + goto out; flags |= TQ_FRONT; } @@ -734,11 +741,11 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); -out2: +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); @@ -817,6 +824,7 @@ taskq_thread_spawn(taskq_t *tq) if (!(tq->tq_flags & TASKQ_DYNAMIC)) return (0); + tq->lastspawnstop = jiffies; if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && (tq->tq_flags & TASKQ_ACTIVE)) { spawning = (++tq->tq_nspawn); @@ -828,9 +836,9 @@ taskq_thread_spawn(taskq_t *tq) } /* - * Threads in a dynamic taskq should only exit once it has been completely - * drained and no other threads are actively servicing tasks. This prevents - * threads from being created and destroyed more than is required. + * Threads in a dynamic taskq may exit once there is no more work to do. + * To prevent threads from being created and destroyed too often limit + * the exit rate to one per spl_taskq_thread_timeout_ms. * * The first thread is the thread list is treated as the primary thread. * There is nothing special about the primary thread but in order to avoid @@ -839,19 +847,22 @@ taskq_thread_spawn(taskq_t *tq) static int taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) { - if (!(tq->tq_flags & TASKQ_DYNAMIC)) + ASSERT(!taskq_next_ent(tq)); + if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic) return (0); - + if (!(tq->tq_flags & TASKQ_ACTIVE)) + return (1); if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, tqt_thread_list) == tqt) return (0); - - return - ((tq->tq_nspawn == 0) && /* No threads are being spawned */ - (tq->tq_nactive == 0) && /* No threads are handling tasks */ - (tq->tq_nthreads > 1) && /* More than 1 thread is running */ - (!taskq_next_ent(tq)) && /* There are no pending tasks */ - (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ + ASSERT3U(tq->tq_nthreads, >, 1); + if (tq->tq_nspawn != 0) + return (0); + if (time_before(jiffies, tq->lastspawnstop + + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) + return (0); + tq->lastspawnstop = jiffies; + return (1); } static int @@ -902,10 +913,8 @@ taskq_thread(void *args) if (list_empty(&tq->tq_pend_list) && list_empty(&tq->tq_prio_list)) { - if (taskq_thread_should_stop(tq, tqt)) { - wake_up_all(&tq->tq_wait_waitq); + if (taskq_thread_should_stop(tq, tqt)) break; - } add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); @@ -980,9 +989,6 @@ taskq_thread(void *args) tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); - } else { - if (taskq_thread_should_stop(tq, tqt)) - break; } set_current_state(TASK_INTERRUPTIBLE); @@ -1046,7 +1052,6 @@ taskq_create(const char *name, int threads_arg, pri_t pri, ASSERT(name != NULL); ASSERT(minalloc >= 0); - ASSERT(maxalloc <= INT_MAX); ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ /* Scale the number of threads using nthreads as a percentage */ @@ -1090,6 +1095,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; + tq->lastspawnstop = jiffies; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); @@ -1229,6 +1235,42 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); +/* + * Create a taskq with a specified number of pool threads. Allocate + * and return an array of nthreads kthread_t pointers, one for each + * thread in the pool. The array is not ordered and must be freed + * by the caller. + */ +taskq_t * +taskq_create_synced(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int i = 0; + kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, + KM_SLEEP); + + flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); + + /* taskq_create spawns all the threads before returning */ + tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, + flags | TASKQ_PREPOPULATE); + VERIFY(tq != NULL); + VERIFY(tq->tq_nthreads == nthreads); + + list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) { + kthreads[i] = tqt->tqt_thread; + i++; + } + + ASSERT3S(i, ==, nthreads); + *ktpp = kthreads; + + return (tq); +} +EXPORT_SYMBOL(taskq_create_synced); + static unsigned int spl_taskq_kick = 0; /* @@ -1379,7 +1421,7 @@ spl_taskq_init(void) system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) - return (1); + return (-ENOMEM); system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); @@ -1388,7 +1430,7 @@ spl_taskq_init(void) cpuhp_remove_multi_state(spl_taskq_cpuhp_state); #endif taskq_destroy(system_taskq); - return (1); + return (-ENOMEM); } dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, @@ -1399,7 +1441,7 @@ spl_taskq_init(void) #endif taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); - return (1); + return (-ENOMEM); } /* diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c index 32a2d34b1d93..ee3eb4690c3a 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c @@ -26,6 +26,7 @@ #include <sys/thread.h> #include <sys/kmem.h> #include <sys/tsd.h> +#include <sys/string.h> /* * Thread interfaces @@ -92,7 +93,7 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func, return (NULL); } - strncpy(tp->tp_name, name, tp->tp_name_size); + strlcpy(tp->tp_name, name, tp->tp_name_size); /* * Strip trailing "_thread" from passed name which will be the func @@ -178,12 +179,11 @@ issig(int why) sigorsets(&set, &task->blocked, &set); spin_lock_irq(&task->sighand->siglock); - int ret; #ifdef HAVE_DEQUEUE_SIGNAL_4ARG enum pid_type __type; - if ((ret = dequeue_signal(task, &set, &__info, &__type)) != 0) { + if (dequeue_signal(task, &set, &__info, &__type) != 0) { #else - if ((ret = dequeue_signal(task, &set, &__info)) != 0) { + if (dequeue_signal(task, &set, &__info) != 0) { #endif #ifdef HAVE_SIGNAL_STOP spin_unlock_irq(&task->sighand->siglock); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c index 7912a381294d..d3e53e541b8b 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c index 546db9ab8bd7..389c9d0d6df3 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c @@ -706,7 +706,7 @@ spl_tsd_init(void) { tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); if (tsd_hash_table == NULL) - return (1); + return (-ENOMEM); return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c index 6b77524181db..e1773da5d173 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c @@ -25,6 +25,7 @@ #include <sys/debug.h> #include <sys/types.h> #include <sys/sysmacros.h> +#include <rpc/types.h> #include <rpc/xdr.h> /* diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c index 589496da0c78..8c6282ee5d16 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c @@ -204,7 +204,7 @@ spl_zlib_init(void) size, 0, NULL, NULL, NULL, NULL, NULL, KMC_KVMEM); if (!zlib_workspace_cache) - return (1); + return (-ENOMEM); return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index b8a8b7cd8cd8..d0d0cca154a7 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -25,18 +25,20 @@ */ #include <sys/types.h> -#include <sys/mutex.h> #include <sys/sysmacros.h> #include <sys/kmem.h> #include <linux/file.h> #include <linux/magic.h> #include <sys/zone.h> +#include <sys/string.h> #if defined(CONFIG_USER_NS) #include <linux/statfs.h> #include <linux/proc_ns.h> #endif +#include <sys/mutex.h> + static kmutex_t zone_datasets_lock; static struct list_head zone_datasets; @@ -49,7 +51,7 @@ typedef struct zone_datasets { typedef struct zone_dataset { struct list_head zd_list; /* zone_dataset linkage */ size_t zd_dsnamelen; /* length of name */ - char zd_dsname[0]; /* name of the member dataset */ + char zd_dsname[]; /* name of the member dataset */ } zone_dataset_t; #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) @@ -203,8 +205,7 @@ zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd) zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP); zd->zd_dsnamelen = dsnamelen; - strncpy(zd->zd_dsname, dataset, dsnamelen); - zd->zd_dsname[dsnamelen] = '\0'; + strlcpy(zd->zd_dsname, dataset, dsnamelen + 1); INIT_LIST_HEAD(&zd->zd_list); list_add_tail(&zd->zd_list, &zds->zds_datasets); @@ -415,8 +416,8 @@ spl_zone_fini(void) zone_dataset_t, zd_list); list_del(&zd->zd_list); kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); - put_user_ns(zds->zds_userns); } + put_user_ns(zds->zds_userns); list_del(&zds->zds_list); kmem_free(zds, sizeof (*zds)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 0cd4fa5213d4..cee7410c8833 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ /* @@ -59,9 +60,19 @@ #include <sys/zfs_znode.h> #ifdef _KERNEL #include <linux/kmap_compat.h> +#include <linux/mm_compat.h> #include <linux/scatterlist.h> +#include <linux/version.h> +#endif + +#ifdef _KERNEL +#if defined(MAX_ORDER) +#define ABD_MAX_ORDER (MAX_ORDER) +#elif defined(MAX_PAGE_ORDER) +#define ABD_MAX_ORDER (MAX_PAGE_ORDER) +#endif #else -#define MAX_ORDER 1 +#define ABD_MAX_ORDER (1) #endif typedef struct abd_stats { @@ -71,7 +82,7 @@ typedef struct abd_stats { kstat_named_t abdstat_scatter_cnt; kstat_named_t abdstat_scatter_data_size; kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER]; kstat_named_t abdstat_scatter_page_multi_chunk; kstat_named_t abdstat_scatter_page_multi_zone; kstat_named_t abdstat_scatter_page_alloc_retry; @@ -132,14 +143,14 @@ static abd_stats_t abd_stats = { { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, }; -struct { +static struct { wmsum_t abdstat_struct_size; wmsum_t abdstat_linear_cnt; wmsum_t abdstat_linear_data_size; wmsum_t abdstat_scatter_cnt; wmsum_t abdstat_scatter_data_size; wmsum_t abdstat_scatter_chunk_waste; - wmsum_t abdstat_scatter_orders[MAX_ORDER]; + wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER]; wmsum_t abdstat_scatter_page_multi_chunk; wmsum_t abdstat_scatter_page_multi_zone; wmsum_t abdstat_scatter_page_alloc_retry; @@ -222,7 +233,7 @@ abd_free_struct_impl(abd_t *abd) } #ifdef _KERNEL -static unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; +static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1; /* * Mark zfs data pages so they can be excluded from kernel crash dumps @@ -272,18 +283,21 @@ abd_alloc_chunks(abd_t *abd, size_t size) struct page *page, *tmp_page = NULL; gfp_t gfp = __GFP_NOWARN | GFP_NOIO; gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); - int nr_pages = abd_chunkcnt_for_bytes(size); - int chunks = 0, zones = 0; + unsigned int max_order = MIN(zfs_abd_scatter_max_order, + ABD_MAX_ORDER - 1); + unsigned int nr_pages = abd_chunkcnt_for_bytes(size); + unsigned int chunks = 0, zones = 0; size_t remaining_size; int nid = NUMA_NO_NODE; - int alloc_pages = 0; + unsigned int alloc_pages = 0; INIT_LIST_HEAD(&pages); + ASSERT3U(alloc_pages, <, nr_pages); + while (alloc_pages < nr_pages) { - unsigned chunk_pages; - int order; + unsigned int chunk_pages; + unsigned int order; order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); chunk_pages = (1U << order); @@ -597,10 +611,8 @@ abd_free_chunks(abd_t *abd) struct scatterlist *sg; abd_for_each_sg(abd, sg, n, i) { - for (int j = 0; j < sg->length; j += PAGESIZE) { - struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); - umem_free(p, PAGESIZE); - } + struct page *p = nth_page(sg_page(sg), 0); + umem_free_aligned(p, PAGESIZE); } abd_free_sg_table(abd); } @@ -706,7 +718,7 @@ abd_free_zero_scatter(void) __free_page(abd_zero_page); #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ #else - umem_free(abd_zero_page, PAGESIZE); + umem_free_aligned(abd_zero_page, PAGESIZE); #endif /* _KERNEL */ } @@ -729,7 +741,7 @@ abd_kstats_update(kstat_t *ksp, int rw) wmsum_value(&abd_sums.abdstat_scatter_data_size); as->abdstat_scatter_chunk_waste.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); - for (int i = 0; i < MAX_ORDER; i++) { + for (int i = 0; i < ABD_MAX_ORDER; i++) { as->abdstat_scatter_orders[i].value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_orders[i]); } @@ -758,7 +770,7 @@ abd_init(void) wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); - for (i = 0; i < MAX_ORDER; i++) + for (i = 0; i < ABD_MAX_ORDER; i++) wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0); wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0); wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0); @@ -768,7 +780,7 @@ abd_init(void) abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { - for (i = 0; i < MAX_ORDER; i++) { + for (i = 0; i < ABD_MAX_ORDER; i++) { snprintf(abd_stats.abdstat_scatter_orders[i].name, KSTAT_STRLEN, "scatter_order_%d", i); abd_stats.abdstat_scatter_orders[i].data_type = @@ -798,7 +810,7 @@ abd_fini(void) wmsum_fini(&abd_sums.abdstat_scatter_cnt); wmsum_fini(&abd_sums.abdstat_scatter_data_size); wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); - for (int i = 0; i < MAX_ORDER; i++) + for (int i = 0; i < ABD_MAX_ORDER; i++) wmsum_fini(&abd_sums.abdstat_scatter_orders[i]); wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk); wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone); @@ -886,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -906,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -917,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -1000,6 +1015,134 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) + +/* + * This is abd_iter_page(), the function underneath abd_iterate_page_func(). + * It yields the next page struct and data offset and size within it, without + * mapping it into the address space. + */ + +/* + * "Compound pages" are a group of pages that can be referenced from a single + * struct page *. Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we + * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a + * great many of the IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGESIZE pages, and can be safely used + * as-is. However, the head page has length covering itself and all the tail + * pages. If the ABD chunk spans multiple pages, then we can use the head page + * and a >PAGESIZE length, which is far more efficient. + * + * Before kernel 4.5 however, compound page heads were refcounted separately + * from tail pages, such that moving back to the head page would require us to + * take a reference to it and releasing it once we're completely finished with + * it. In practice, that means when our caller is done with the ABD, which we + * have no insight into from here. Rather than contort this API to track head + * page references on such ancient kernels, we disable this special compound + * page handling on 4.5, instead just using treating each page within it as a + * regular PAGESIZE page (which it is). This is slightly less efficient, but + * makes everything far simpler. + * + * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the + * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to + * understand compound pages, or not, as required. + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define ABD_ITER_COMPOUND_PAGES 1 +#define ABD_ITER_PAGE_SIZE(page) \ + (PageCompound(page) ? page_size(page) : PAGESIZE) +#else +#undef ABD_ITER_COMPOUND_PAGES +#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) +#endif + +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + /* + * Find the page, and the start of the data within it. This is computed + * differently for linear and scatter ABDs; linear is referenced by + * virtual memory location, while scatter is referenced by page + * pointer. + */ + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = nth_page(sg_page(aiter->iter_sg), + aiter->iter_offset >> PAGE_SHIFT); + + /* position within page */ + doff = aiter->iter_offset & (PAGESIZE - 1); + } + +#ifdef ABD_ITER_COMPOUND_PAGES + if (PageTail(page)) { + /* + * If this is a compound tail page, move back to the head, and + * adjust the offset to match. This may let us yield a much + * larger amount of data from a single logical page, and so + * leave our caller with fewer pages to process. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } +#endif + + ASSERT(page); + + /* + * Compute the maximum amount of data we can take from this page. This + * is the smaller of: + * - the remaining space in the page + * - the remaining space in this scatterlist entry (which may not cover + * the entire page) + * - the remaining space in the abd (which may not cover the entire + * scatterlist entry) + */ + dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff, + aiter->iter_abd->abd_size - aiter->iter_pos); + if (!abd_is_linear(aiter->iter_abd)) + dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset); + ASSERT3U(dsize, >, 0); + + /* final iterator outputs */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + aiter->iter_page_dsize = dsize; +} + +/* + * Note: ABD BIO functions only needed to support vdev_classic. See comments in + * vdev_disk.c. + */ + /* * bio_nr_pages for ABD. * @off is the offset in @abd @@ -1154,4 +1297,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index a95e9c334af9..02dd80c06062 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -80,12 +80,18 @@ static struct notifier_block arc_hotplug_callback_mem_nb; /* * Return a default max arc size based on the amount of physical memory. + * This may be overridden by tuning the zfs_arc_max module parameter. */ uint64_t arc_default_max(uint64_t min, uint64_t allmem) { - /* Default to 1/2 of all memory. */ - return (MAX(allmem / 2, min)); + uint64_t size; + + if (allmem >= 1 << 30) + size = allmem - (1 << 30); + else + size = min; + return (MAX(allmem * 5 / 8, size)); } #ifdef _KERNEL @@ -219,7 +225,11 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) arc_reduce_target_size(ptob(sc->nr_to_scan)); arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); if (current->reclaim_state != NULL) +#ifdef HAVE_RECLAIM_STATE_RECLAIMED + current->reclaim_state->reclaimed += sc->nr_to_scan; +#else current->reclaim_state->reclaimed_slab += sc->nr_to_scan; +#endif /* * We are experiencing memory pressure which the arc_evict_zthr was @@ -243,8 +253,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) return (sc->nr_to_scan); } -SPL_SHRINKER_DECLARE(arc_shrinker, - arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); +static struct shrinker *arc_shrinker = NULL; int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) @@ -347,22 +356,26 @@ arc_lowmem_init(void) * reclaim from the arc. This is done to prevent kswapd from * swapping out pages when it is preferable to shrink the arc. */ - spl_register_shrinker(&arc_shrinker); + arc_shrinker = spl_register_shrinker("zfs-arc-shrinker", + arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + VERIFY(arc_shrinker); + arc_set_sys_free(allmem); } void arc_lowmem_fini(void) { - spl_unregister_shrinker(&arc_shrinker); + spl_unregister_shrinker(arc_shrinker); + arc_shrinker = NULL; } int -param_set_arc_long(const char *buf, zfs_kernel_param_t *kp) +param_set_arc_u64(const char *buf, zfs_kernel_param_t *kp) { int error; - error = param_set_long(buf, kp); + error = spl_param_set_u64(buf, kp); if (error < 0) return (SET_ERROR(error)); @@ -374,13 +387,13 @@ param_set_arc_long(const char *buf, zfs_kernel_param_t *kp) int param_set_arc_min(const char *buf, zfs_kernel_param_t *kp) { - return (param_set_arc_long(buf, kp)); + return (param_set_arc_u64(buf, kp)); } int param_set_arc_max(const char *buf, zfs_kernel_param_t *kp) { - return (param_set_arc_long(buf, kp)); + return (param_set_arc_u64(buf, kp)); } int @@ -485,56 +498,5 @@ arc_unregister_hotplug(void) } #endif /* _KERNEL */ -/* - * Helper function for arc_prune_async() it is responsible for safely - * handling the execution of a registered arc_prune_func_t. - */ -static void -arc_prune_task(void *ptr) -{ - arc_prune_t *ap = (arc_prune_t *)ptr; - arc_prune_func_t *func = ap->p_pfunc; - - if (func != NULL) - func(ap->p_adjust, ap->p_private); - - zfs_refcount_remove(&ap->p_refcnt, func); -} - -/* - * Notify registered consumers they must drop holds on a portion of the ARC - * buffered they reference. This provides a mechanism to ensure the ARC can - * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This - * is analogous to dnlc_reduce_cache() but more generic. - * - * This operation is performed asynchronously so it may be safely called - * in the context of the arc_reclaim_thread(). A reference is taken here - * for each registered arc_prune_t and the arc_prune_task() is responsible - * for releasing it once the registered arc_prune_func_t has completed. - */ -void -arc_prune_async(int64_t adjust) -{ - arc_prune_t *ap; - - mutex_enter(&arc_prune_mtx); - for (ap = list_head(&arc_prune_list); ap != NULL; - ap = list_next(&arc_prune_list, ap)) { - - if (zfs_refcount_count(&ap->p_refcnt) >= 2) - continue; - - zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); - ap->p_adjust = adjust; - if (taskq_dispatch(arc_prune_taskq, arc_prune_task, - ap, TQ_SLEEP) == TASKQID_INVALID) { - zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); - continue; - } - ARCSTAT_BUMP(arcstat_prune); - } - mutex_exit(&arc_prune_mtx); -} - ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, "Limit on number of pages that ARC shrinker can reclaim at once"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c index ff3ef1bf6ad9..7e5bd392437e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -30,7 +30,7 @@ param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp) { int ret; - ret = param_set_ulong(val, kp); + ret = spl_param_set_u64(val, kp); if (ret < 0) return (ret); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c index ab00d2ae14d2..5d1b4383412a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -124,7 +124,7 @@ secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) if (crgetuid(cr) == owner) return (0); - if (zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (0); #if defined(CONFIG_USER_NS) @@ -214,8 +214,10 @@ secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr, * Determine that subject can set the file setgid flag. */ int -secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) +secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid, zidmap_t *mnt_ns, + struct user_namespace *fs_ns) { + gid = zfs_gid_to_vfsgid(mnt_ns, fs_ns, gid); #if defined(CONFIG_USER_NS) if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) return (EPERM); @@ -284,8 +286,11 @@ secpolicy_setid_clear(vattr_t *vap, cred_t *cr) * Determine that subject can set the file setid flags. */ static int -secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) +secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner, zidmap_t *mnt_ns, + struct user_namespace *fs_ns) { + owner = zfs_uid_to_vfsuid(mnt_ns, fs_ns, owner); + if (crgetuid(cr) == owner) return (0); @@ -310,13 +315,14 @@ secpolicy_vnode_stky_modify(const cred_t *cr) int secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, - const vattr_t *ovap, cred_t *cr) + const vattr_t *ovap, cred_t *cr, zidmap_t *mnt_ns, + struct user_namespace *fs_ns) { int error; if ((vap->va_mode & S_ISUID) != 0 && (error = secpolicy_vnode_setid_modify(cr, - ovap->va_uid)) != 0) { + ovap->va_uid, mnt_ns, fs_ns)) != 0) { return (error); } @@ -334,7 +340,8 @@ secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, * group-id bit. */ if ((vap->va_mode & S_ISGID) != 0 && - secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { + secpolicy_vnode_setids_setgids(cr, ovap->va_gid, + mnt_ns, fs_ns) != 0) { vap->va_mode &= ~S_ISGID; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat.c b/sys/contrib/openzfs/module/os/linux/zfs/qat.c index 08613b3a2042..07e0cafabb0e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/qat.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/qat.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c index 1d099c95bc7c..6d0595dd5f76 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -193,7 +193,9 @@ qat_dc_init(void) sd.huffType = CPA_DC_HT_FULL_DYNAMIC; sd.sessDirection = CPA_DC_DIR_COMBINED; sd.sessState = CPA_DC_STATELESS; +#if (CPA_DC_API_VERSION_NUM_MAJOR == 1 && CPA_DC_API_VERSION_NUM_MINOR < 6) sd.deflateWindowSize = 7; +#endif sd.checksum = CPA_DC_ADLER32; status = cpaDcGetSessionSize(dc_inst_handles[i], &sd, &sess_size, &ctx_size); @@ -247,7 +249,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, Cpa8U *buffer_meta_src = NULL; Cpa8U *buffer_meta_dst = NULL; Cpa32U buffer_meta_size = 0; - CpaDcRqResults dc_results; + CpaDcRqResults dc_results = {.checksum = 1}; CpaStatus status = CPA_STATUS_FAIL; Cpa32U hdr_sz = 0; Cpa32U compressed_sz; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c index 18b6e38d1a6e..0523a23c61e1 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c index cbdc0f350ad8..c8cbedcd5157 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -60,7 +60,7 @@ param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp) { int error; - error = param_set_ulong(val, kp); + error = spl_param_set_u64(val, kp); if (error < 0) return (SET_ERROR(error)); @@ -74,7 +74,7 @@ param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp) { int error; - error = param_set_ulong(val, kp); + error = spl_param_set_u64(val, kp); if (error < 0) return (SET_ERROR(error)); @@ -103,6 +103,18 @@ param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp) return (0); } +int +param_set_active_allocator(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = -param_set_active_allocator_common(val); + if (error == 0) + error = param_set_charp(val, kp); + + return (error); +} + const char * spa_history_zone(void) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/trace.c b/sys/contrib/openzfs/module/os/linux/zfs/trace.c index a690822ae14c..32a188d169e3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/trace.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/trace.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 9a382261df73..2cea61a6294c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include <sys/zfs_context.h> @@ -41,12 +42,49 @@ #include <linux/blk-cgroup.h> #endif +/* + * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying + * block_device. Since it carries the block_device inside, its convenient to + * just use the handle as a proxy. + * + * Linux 6.9.x uses a file for the same purpose. + * + * For pre-6.8, we just emulate this with a cast, since we don't need any of + * the other fields inside the handle. + */ +#if defined(HAVE_BDEV_OPEN_BY_PATH) +typedef struct bdev_handle zfs_bdev_handle_t; +#define BDH_BDEV(bdh) ((bdh)->bdev) +#define BDH_IS_ERR(bdh) (IS_ERR(bdh)) +#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) +#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH) +typedef struct file zfs_bdev_handle_t; +#define BDH_BDEV(bdh) (file_bdev(bdh)) +#define BDH_IS_ERR(bdh) (IS_ERR(bdh)) +#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) +#else +typedef void zfs_bdev_handle_t; +#define BDH_BDEV(bdh) ((struct block_device *)bdh) +#define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh))) +#define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh))) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) +#endif + typedef struct vdev_disk { - struct block_device *vd_bdev; + zfs_bdev_handle_t *vd_bdh; krwlock_t vd_lock; } vdev_disk_t; /* + * Maximum number of segments to add to a bio (min 4). If this is higher than + * the maximum allowed by the device queue or the kernel itself, it will be + * clamped. Setting it to zero will cause the kernel's ideal size to be used. + */ +uint_t zfs_vdev_disk_max_segs = 0; + +/* * Unique identifier for the exclusive vdev holder. */ static void *zfs_vdev_holder = VDEV_HOLDER; @@ -56,7 +94,7 @@ static void *zfs_vdev_holder = VDEV_HOLDER; * device is missing. The missing path may be transient since the links * can be briefly removed and recreated in response to udev events. */ -static unsigned zfs_vdev_open_timeout_ms = 1000; +static uint_t zfs_vdev_open_timeout_ms = 1000; /* * Size of the "reserved" partition, in blocks. @@ -64,28 +102,46 @@ static unsigned zfs_vdev_open_timeout_ms = 1000; #define EFI_MIN_RESV_SIZE (16 * 1024) /* - * Virtual device vector for disks. + * BIO request failfast mask. */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[0]; /* Attached bio's */ -} dio_request_t; -static fmode_t -vdev_bdev_mode(spa_mode_t spa_mode) +static unsigned int zfs_vdev_failfast_mask = 1; + +/* + * Convert SPA mode flags into bdev open mode flags. + */ +#ifdef HAVE_BLK_MODE_T +typedef blk_mode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ BLK_OPEN_READ +#define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE +#define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL +#define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) +#else +typedef fmode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ FMODE_READ +#define VDEV_BDEV_MODE_WRITE FMODE_WRITE +#define VDEV_BDEV_MODE_EXCL FMODE_EXCL +#define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) +#endif + +static vdev_bdev_mode_t +vdev_bdev_mode(spa_mode_t smode) { - fmode_t mode = 0; + ASSERT3U(smode, !=, SPA_MODE_UNINIT); + ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); - if (spa_mode & SPA_MODE_READ) - mode |= FMODE_READ; + vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; - if (spa_mode & SPA_MODE_WRITE) - mode |= FMODE_WRITE; + if (smode & SPA_MODE_READ) + bmode |= VDEV_BDEV_MODE_READ; - return (mode); + if (smode & SPA_MODE_WRITE) + bmode |= VDEV_BDEV_MODE_WRITE; + + ASSERT(bmode & VDEV_BDEV_MODE_MASK); + ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); + + return (bmode); } /* @@ -105,6 +161,16 @@ bdev_whole(struct block_device *bdev) } #endif +#if defined(HAVE_BDEVNAME) +#define vdev_bdevname(bdev, name) bdevname(bdev, name) +#else +static inline void +vdev_bdevname(struct block_device *bdev, char *name) +{ + snprintf(name, BDEVNAME_SIZE, "%pg", bdev); +} +#endif + /* * Returns the maximum expansion capacity of the block device (in bytes). * @@ -163,18 +229,60 @@ vdev_disk_error(zio_t *zio) * which is safe from any context. */ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " - "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), + "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), zio->io_vd->vdev_path, zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); } +static void +vdev_disk_kobj_evt_post(vdev_t *v) +{ + vdev_disk_t *vd = v->vdev_tsd; + if (vd && vd->vd_bdh) { + spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); + } else { + vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", + v->vdev_path); + } +} + +static zfs_bdev_handle_t * +vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) +{ + vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); + +#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) + return (bdev_file_open_by_path(path, bmode, holder, NULL)); +#elif defined(HAVE_BDEV_OPEN_BY_PATH) + return (bdev_open_by_path(path, bmode, holder, NULL)); +#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) + return (blkdev_get_by_path(path, bmode, holder, NULL)); +#else + return (blkdev_get_by_path(path, bmode, holder)); +#endif +} + +static void +vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) +{ +#if defined(HAVE_BDEV_RELEASE) + return (bdev_release(bdh)); +#elif defined(HAVE_BLKDEV_PUT_HOLDER) + return (blkdev_put(BDH_BDEV(bdh), holder)); +#elif defined(HAVE_BLKDEV_PUT) + return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); +#else + fput(bdh); +#endif +} + static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - struct block_device *bdev; - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + zfs_bdev_handle_t *bdh; + spa_mode_t smode = spa_mode(v->vdev_spa); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -199,12 +307,13 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, boolean_t reread_part = B_FALSE; rw_enter(&vd->vd_lock, RW_WRITER); - bdev = vd->vd_bdev; - vd->vd_bdev = NULL; + bdh = vd->vd_bdh; + vd->vd_bdh = NULL; - if (bdev) { + if (bdh) { + struct block_device *bdev = BDH_BDEV(bdh); if (v->vdev_expanding && bdev != bdev_whole(bdev)) { - bdevname(bdev_whole(bdev), disk_name + 5); + vdev_bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition @@ -224,15 +333,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); } if (reread_part) { - bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, + bdh = vdev_blkdev_get_by_path(disk_name, smode, zfs_vdev_holder); - if (!IS_ERR(bdev)) { - int error = vdev_bdev_reread_part(bdev); - blkdev_put(bdev, mode | FMODE_EXCL); + if (!BDH_IS_ERR(bdh)) { + int error = + vdev_bdev_reread_part(BDH_BDEV(bdh)); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -275,58 +385,67 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, * subsequent attempts are expected to eventually succeed. */ hrtime_t start = gethrtime(); - bdev = ERR_PTR(-ENXIO); - while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { - bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, + bdh = BDH_ERR_PTR(-ENXIO); + while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { + bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, zfs_vdev_holder); - if (unlikely(PTR_ERR(bdev) == -ENOENT)) { + if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { + /* + * There is no point of waiting since device is removed + * explicitly + */ + if (v->vdev_removed) + break; + schedule_timeout(MSEC_TO_TICK(10)); - } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) { + } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; - } else if (IS_ERR(bdev)) { + } else if (BDH_IS_ERR(bdh)) { break; } } - if (IS_ERR(bdev)) { - int error = -PTR_ERR(bdev); + if (BDH_IS_ERR(bdh)) { + int error = -BDH_PTR_ERR(bdh); vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, (u_longlong_t)(gethrtime() - start), (u_longlong_t)timeout); - vd->vd_bdev = NULL; + vd->vd_bdh = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); return (SET_ERROR(error)); } else { - vd->vd_bdev = bdev; + vd->vd_bdh = bdh; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); } + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + /* Determine the physical block size */ - int physical_block_size = bdev_physical_block_size(vd->vd_bdev); + int physical_block_size = bdev_physical_block_size(bdev); /* Determine the logical block size */ - int logical_block_size = bdev_logical_block_size(vd->vd_bdev); + int logical_block_size = bdev_logical_block_size(bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; /* Set when device reports it supports TRIM. */ - v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev); + v->vdev_has_trim = bdev_discard_supported(bdev); /* Set when device reports it supports secure TRIM. */ - v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev); + v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); /* Inform the ZIO pipeline that we are non-rotational */ - v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); + v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); /* Physical volume size in bytes for the partition */ - *psize = bdev_capacity(vd->vd_bdev); + *psize = bdev_capacity(bdev); /* Physical volume size in bytes including possible expansion space */ - *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); + *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *physical_ashift = highbit64(MAX(physical_block_size, @@ -346,98 +465,15 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdev != NULL) { - blkdev_put(vd->vd_bdev, - vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); - } + if (vd->vd_bdh != NULL) + vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), + zfs_vdev_holder); rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static int -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } - - return (rc); -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - int rc; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - rc = vdev_disk_dio_put(dr); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -589,8 +625,467 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +static inline uint_t +vdev_bio_max_segs(struct block_device *bdev) +{ + /* + * Smallest of the device max segs and the tuneable max segs. Minimum + * 4, so there's room to finish split pages if they come up. + */ + const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); + const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? + MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; + const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); + +#ifdef HAVE_BIO_MAX_SEGS + return (bio_max_segs(max_segs)); +#else + return (MIN(max_segs, BIO_MAX_PAGES)); +#endif +} + +static inline uint_t +vdev_bio_max_bytes(struct block_device *bdev) +{ + return (queue_max_sectors(bdev_get_queue(bdev)) << 9); +} + + +/* + * Virtual block IO object (VBIO) + * + * Linux block IO (BIO) objects have a limit on how many data segments (pages) + * they can hold. Depending on how they're allocated and structured, a large + * ZIO can require more than one BIO to be submitted to the kernel, which then + * all have to complete before we can return the completed ZIO back to ZFS. + * + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to + * translate a ZIO down into the kernel block layer and back again. + * + * Note that these are only used for data ZIOs (read/write). Meta-operations + * (flush/trim) don't need multiple BIOs and so can just make the call + * directly. + */ +typedef struct { + zio_t *vbio_zio; /* parent zio */ + + struct block_device *vbio_bdev; /* blockdev to submit bios to */ + + abd_t *vbio_abd; /* abd carrying borrowed linear buf */ + + uint_t vbio_max_segs; /* max segs per bio */ + + uint_t vbio_max_bytes; /* max bytes per bio */ + uint_t vbio_lbs_mask; /* logical block size mask */ + + uint64_t vbio_offset; /* start offset of next bio */ + + struct bio *vbio_bio; /* pointer to the current bio */ + int vbio_flags; /* bio flags */ +} vbio_t; + +static vbio_t * +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) +{ + vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); + + vbio->vbio_zio = zio; + vbio->vbio_bdev = bdev; + vbio->vbio_abd = NULL; + vbio->vbio_max_segs = vdev_bio_max_segs(bdev); + vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); + vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); + vbio->vbio_offset = zio->io_offset; + vbio->vbio_bio = NULL; + vbio->vbio_flags = flags; + + return (vbio); +} + +BIO_END_IO_PROTO(vbio_completion, bio, error); + +static int +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) +{ + struct bio *bio = vbio->vbio_bio; + uint_t ssize; + + while (size > 0) { + if (bio == NULL) { + /* New BIO, allocate and set up */ + bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, + vbio->vbio_max_segs); + VERIFY(bio); + + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, vbio->vbio_flags); + + if (vbio->vbio_bio) { + bio_chain(vbio->vbio_bio, bio); + vdev_submit_bio(vbio->vbio_bio); + } + vbio->vbio_bio = bio; + } + + /* + * Only load as much of the current page data as will fit in + * the space left in the BIO, respecting lbs alignment. Older + * kernels will error if we try to overfill the BIO, while + * newer ones will accept it and split the BIO. This ensures + * everything works on older kernels, and avoids an additional + * overhead on the new. + */ + ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & + vbio->vbio_lbs_mask); + if (ssize > 0 && + bio_add_page(bio, page, ssize, offset) == ssize) { + /* Accepted, adjust and load any remaining. */ + size -= ssize; + offset += ssize; + continue; + } + + /* No room, set up for a new BIO and loop */ + vbio->vbio_offset += BIO_BI_SIZE(bio); + + /* Signal new BIO allocation wanted */ + bio = NULL; + } + + return (0); +} + +/* Iterator callback to submit ABD pages to the vbio. */ +static int +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} + +/* Create some BIOs, fill them with data and submit them */ +static void +vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) +{ + /* + * We plug so we can submit the BIOs as we go and only unplug them when + * they are fully created and submitted. This is important; if we don't + * plug, then the kernel may start executing earlier BIOs while we're + * still creating and executing later ones, and if the device goes + * away while that's happening, older kernels can get confused and + * trample memory. + */ + struct blk_plug plug; + blk_start_plug(&plug); + + (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); + ASSERT(vbio->vbio_bio); + + vbio->vbio_bio->bi_end_io = vbio_completion; + vbio->vbio_bio->bi_private = vbio; + + /* + * Once submitted, vbio_bio now owns vbio (through bi_private) and we + * can't touch it again. The bio may complete and vbio_completion() be + * called and free the vbio before this task is run again, so we must + * consider it invalid from this point. + */ + vdev_submit_bio(vbio->vbio_bio); + + blk_finish_plug(&plug); +} + +/* IO completion callback */ +BIO_END_IO_PROTO(vbio_completion, bio, error) +{ + vbio_t *vbio = bio->bi_private; + zio_t *zio = vbio->vbio_zio; + + ASSERT(zio); + + /* Capture and log any errors */ +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = 0; + if (error) + zio->io_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + zio->io_error = EIO; +#endif + ASSERT3U(zio->io_error, >=, 0); + + if (zio->io_error) + vdev_disk_error(zio); + + /* Return the BIO to the kernel */ + bio_put(bio); + + /* + * If we copied the ABD before issuing it, clean up and return the copy + * to the ADB, with changes if appropriate. + */ + if (vbio->vbio_abd != NULL) { + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); + } + + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); + + /* All done, submit for processing */ + zio_delay_interrupt(zio); +} + +/* + * Iterator callback to count ABD pages and check their size & alignment. + * + * On Linux, each BIO segment can take a page pointer, and an offset+length of + * the data within that page. A page can be arbitrarily large ("compound" + * pages) but we still have to ensure the data portion is correctly sized and + * aligned to the logical block size, to ensure that if the kernel wants to + * split the BIO, the two halves will still be properly aligned. + * + * NOTE: if you change this function, change the copy in + * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test + * data there to validate the change you're making. + * + */ +typedef struct { + uint_t bmask; + uint_t npages; + uint_t end; +} vdev_disk_check_pages_t; + +static int +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +{ + (void) page; + vdev_disk_check_pages_t *s = priv; + + /* + * If we didn't finish on a block size boundary last time, then there + * would be a gap if we tried to use this ABD as-is, so abort. + */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = (off+len) & s->bmask; + + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +/* + * Check if we can submit the pages in this ABD to the kernel as-is. Returns + * the number of pages, or 0 if it can't be submitted like this. + */ +static boolean_t +vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +{ + vdev_disk_check_pages_t s = { + .bmask = bdev_logical_block_size(bdev)-1, + .npages = 0, + .end = 0, + }; + + if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + return (B_FALSE); + + return (B_TRUE); +} + +static int +vdev_disk_io_rw(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + int flags = 0; + + /* + * Accessing outside the block device is never allowed. + */ + if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + (u_longlong_t)zio->io_offset, + (u_longlong_t)zio->io_size, + (u_longlong_t)i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && + v->vdev_failfast == B_TRUE) { + bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, + zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); + } + + /* + * Check alignment of the incoming ABD. If any part of it would require + * submitting a page that is not aligned to the logical block size, + * then we take a copy into a linear buffer and submit that instead. + * This should be impossible on a 512b LBS, and fairly rare on 4K, + * usually requiring abnormally-small data blocks (eg gang blocks) + * mixed into the same ABD as larger ones (eg aggregated). + */ + abd_t *abd = zio->io_abd; + if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { + void *buf; + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + + /* + * Wrap the copy in an abd_t, so we can use the same iterators + * to count and fill the vbio later. + */ + abd = abd_get_from_buf(buf, zio->io_size); + + /* + * False here would mean the borrowed copy has an invalid + * alignment too, which would mean we've somehow been passed a + * linear ABD with an interior page that has a non-zero offset + * or a size not a multiple of PAGE_SIZE. This is not possible. + * It would mean either zio_buf_alloc() or its underlying + * allocators have done something extremely strange, or our + * math in vdev_disk_check_pages() is wrong. In either case, + * something in seriously wrong and its not safe to continue. + */ + VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + } + + /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ + vbio_t *vbio = vbio_alloc(zio, bdev, flags); + if (abd != zio->io_abd) + vbio->vbio_abd = abd; + + /* Fill it with data pages and submit it to the kernel */ + vbio_submit(vbio, abd, zio->io_size); + return (0); +} + +/* ========== */ + +/* + * This is the classic, battle-tested BIO submission code. Until we're totally + * sure that the new code is safe and correct in all cases, this will remain + * available and can be enabled by setting zfs_vdev_disk_classic=1 at module + * load time. + * + * These functions have been renamed to vdev_classic_* to make it clear what + * they belong to, but their implementations are unchanged. + */ + +/* + * Virtual device vector for disks. + */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[]; /* Attached bio's */ +} dio_request_t; + +static dio_request_t * +vdev_classic_dio_alloc(int bio_count) +{ + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + + return (dr); +} + +static void +vdev_classic_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_classic_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static void +vdev_classic_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_classic_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } +} + +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by vdev_classic_physio */ + vdev_classic_dio_put(dr); +} + static inline unsigned int -vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); @@ -603,9 +1098,16 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) +vdev_classic_physio(zio_t *zio) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; + int flags = 0; + dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -628,10 +1130,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } retry: - dr = vdev_disk_dio_alloc(bio_count); + dr = vdev_classic_dio_alloc(bio_count); - if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) - bio_set_flags_failfast(bdev, &flags); + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && + zio->io_vd->vdev_failfast == B_TRUE) { + bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, + zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); + } dr->dr_zio = zio; @@ -660,23 +1165,23 @@ retry: * this should be rare - see the comment above. */ if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } - nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); + nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); + /* Matching put called by vdev_classic_physio_completion */ + vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); @@ -690,7 +1195,7 @@ retry: } /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); + vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); @@ -704,11 +1209,13 @@ retry: if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - (void) vdev_disk_dio_put(dr); + vdev_classic_dio_put(dr); return (error); } +/* ========== */ + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -751,39 +1258,123 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + bio_put(bio); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + +/* + * Wrappers for the different secure erase and discard APIs. We use async + * when available; in this case, *biop is set to the last bio in the chain. + */ static int -vdev_disk_io_trim(zio_t *zio) +vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; + *biop = NULL; + int error; #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { - return (-blkdev_issue_secure_erase(vd->vd_bdev, - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } else { - return (-blkdev_issue_discard(vd->vd_bdev, - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) - unsigned long trim_flags = 0; -#if defined(BLKDEV_DISCARD_SECURE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) - trim_flags |= BLKDEV_DISCARD_SECURE; + error = blkdev_issue_secure_erase(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE); +#else +#error "unsupported kernel" #endif - return (-blkdev_issue_discard(vd->vd_bdev, - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); + + return (error); +} + +static int +vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector, + sector_t nsect, struct bio **biop) +{ + *biop = NULL; + int error; + +#if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS) + error = __blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, biop); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS, 0); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS) + error = blkdev_issue_discard(BDH_BDEV(bdh), + sector, nsect, GFP_NOFS); #else -#error "Unsupported kernel" +#error "unsupported kernel" #endif + + return (error); } +/* + * Entry point for TRIM ops. This calls the right wrapper for secure erase or + * discard, and then does the appropriate finishing work for error vs success + * and async vs sync. + */ +static int +vdev_disk_io_trim(zio_t *zio) +{ + int error; + struct bio *bio; + + zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh; + sector_t sector = zio->io_offset >> 9; + sector_t nsects = zio->io_size >> 9; + + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio); + else + error = vdev_bdev_issue_discard(bdh, sector, nsects, &bio); + + if (error != 0) + return (SET_ERROR(-error)); + + if (bio == NULL) { + /* + * This was a synchronous op that completed successfully, so + * return it to ZFS immediately. + */ + zio_interrupt(zio); + } else { + /* + * This was an asynchronous op; set up completion callback and + * issue it. + */ + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); + } + + return (0); +} + +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; + static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - int rw, error; + int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. @@ -801,7 +1392,7 @@ vdev_disk_io_start(zio_t *zio) * If the vdev is closed, it's likely due to a failed reopen and is * in the UNAVAIL state. Nothing to be done here but return failure. */ - if (vd->vd_bdev == NULL) { + if (vd->vd_bdh == NULL) { rw_exit(&vd->vd_lock); zio->io_error = ENXIO; zio_interrupt(zio); @@ -809,74 +1400,72 @@ vdev_disk_io_start(zio_t *zio) } switch (zio->io_type) { - case ZIO_TYPE_IOCTL: + case ZIO_TYPE_FLUSH: if (!vdev_readable(v)) { - rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - if (v->vdev_nowritecache) { - zio->io_error = SET_ERROR(ENOTSUP); - break; - } - - error = vdev_disk_io_flush(vd->vd_bdev, zio); + /* Drive not there, can't flush */ + error = SET_ERROR(ENXIO); + } else if (zfs_nocacheflush) { + /* Flushing disabled by operator, declare success */ + error = 0; + } else if (v->vdev_nowritecache) { + /* This vdev not capable of flushing */ + error = SET_ERROR(ENOTSUP); + } else { + /* + * Issue the flush. If successful, the response will + * be handled in the completion callback, so we're done. + */ + error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } - - zio->io_error = error; - - break; - - default: - zio->io_error = SET_ERROR(ENOTSUP); } + /* Couldn't issue the flush, so set the error and return it */ rw_exit(&vd->vd_lock); + zio->io_error = error; zio_execute(zio); return; - case ZIO_TYPE_WRITE: - rw = WRITE; - break; - - case ZIO_TYPE_READ: - rw = READ; - break; case ZIO_TYPE_TRIM: - zio->io_error = vdev_disk_io_trim(zio); + error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); - zio_interrupt(zio); + if (error) { + zio->io_error = error; + zio_execute(zio); + } return; - default: + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENOTSUP); - zio_interrupt(zio); + if (error) { + zio->io_error = error; + zio_interrupt(zio); + } return; - } - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); + default: + /* + * Getting here means our parent vdev has made a very strange + * request of us, and shouldn't happen. Assert here to force a + * crash in dev builds, but in production return the IO + * unhandled. The pool will likely suspend anyway but that's + * nicer than crashing the kernel. + */ + ASSERT3S(zio->io_type, ==, -1); - if (error) { - zio->io_error = error; + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } + + __builtin_unreachable(); } static void @@ -891,8 +1480,8 @@ vdev_disk_io_done(zio_t *zio) vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - if (zfs_check_media_change(vd->vd_bdev)) { - invalidate_bdev(vd->vd_bdev); + if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { + invalidate_bdev(BDH_BDEV(vd->vd_bdh)); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } @@ -925,8 +1514,49 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * BIO submission method. See comment above about vdev_classic. + * Set zfs_vdev_disk_classic=0 for new, =1 for classic + */ +static uint_t zfs_vdev_disk_classic = 0; /* default new */ + +/* Set submission function from module parameter */ +static int +vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) +{ + int err = param_set_uint(buf, kp); + if (err < 0) + return (SET_ERROR(err)); + + vdev_disk_io_rw_fn = + zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; + + printk(KERN_INFO "ZFS: forcing %s BIO submission\n", + zfs_vdev_disk_classic ? "classic" : "new"); + + return (0); +} + +/* + * At first use vdev use, set the submission function from the default value if + * it hasn't been set already. + */ +static int +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + (void) nv; + (void) tsd; + + if (vdev_disk_io_rw_fn == NULL) + vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? + vdev_classic_physio : vdev_disk_io_rw; + + return (0); +} + vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, + .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, @@ -947,7 +1577,8 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ + .vdev_op_leaf = B_TRUE, /* leaf vdev */ + .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post }; /* @@ -976,17 +1607,17 @@ MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); int param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { - uint64_t val; + uint_t val; int error; - error = kstrtoull(buf, 0, &val); + error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(-EINVAL)); - error = param_set_ulong(buf, kp); + error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); @@ -996,19 +1627,32 @@ param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) int param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { - uint64_t val; + uint_t val; int error; - error = kstrtoull(buf, 0, &val); + error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(-EINVAL)); - error = param_set_ulong(buf, kp); + error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, + "Timeout before determining that a device is missing"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, + "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); + +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, + "Maximum number of data segments to add to an IO request (min 4)"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, + vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, + "Use classic BIO submission method"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c index f073145326e3..ac41a2615f16 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -53,8 +53,8 @@ static taskq_t *vdev_file_taskq; * impact the vdev_ashift setting which can only be set at vdev creation * time. */ -static unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; -static unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; +static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; +static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; static void vdev_file_hold(vdev_t *vd) @@ -242,7 +242,7 @@ vdev_file_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - if (zio->io_type == ZIO_TYPE_IOCTL) { + if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); @@ -250,33 +250,27 @@ vdev_file_io_start(zio_t *zio) return; } - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - - if (zfs_nocacheflush) - break; - - /* - * We cannot safely call vfs_fsync() when PF_FSTRANS - * is set in the current context. Filesystems like - * XFS include sanity checks to verify it is not - * already set, see xfs_vm_writepage(). Therefore - * the sync must be dispatched to a different context. - */ - if (__spl_pf_fstrans_check()) { - VERIFY3U(taskq_dispatch(vdev_file_taskq, - vdev_file_io_fsync, zio, TQ_SLEEP), !=, - TASKQID_INVALID); - return; - } - - zio->io_error = zfs_file_fsync(vf->vf_file, - O_SYNC | O_DSYNC); - break; - default: - zio->io_error = SET_ERROR(ENOTSUP); + if (zfs_nocacheflush) { + zio_execute(zio); + return; } + /* + * We cannot safely call vfs_fsync() when PF_FSTRANS + * is set in the current context. Filesystems like + * XFS include sanity checks to verify it is not + * already set, see xfs_vm_writepage(). Therefore + * the sync must be dispatched to a different context. + */ + if (__spl_pf_fstrans_check()) { + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_fsync, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; + } + + zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); + zio_execute(zio); return; } else if (zio->io_type == ZIO_TYPE_TRIM) { @@ -376,7 +370,7 @@ vdev_ops_t vdev_disk_ops = { #endif -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, "Logical ashift for file-based devices"); -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, "Physical ashift for file-based devices"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_label_os.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_label_os.c new file mode 100644 index 000000000000..3d965b89a962 --- /dev/null +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_label_os.c @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023 by iXsystems, Inc. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> + +/* + * Check if the reserved boot area is in-use. + * + * This function always returns 0, as there are no known external uses + * of the reserved area on Linux. + */ +int +vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) +{ + (void) spa; + (void) childvd; + + return (0); +} diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index b70691ab31c1..48abbc010917 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -493,10 +493,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); + while ((aclnode = list_remove_head(&aclp->z_acl))) zfs_acl_node_free(aclnode); - } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } @@ -525,7 +523,7 @@ zfs_acl_valid_ace_type(uint_t type, uint_t flags) entry_type == ACE_EVERYONE || entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP); default: - if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + if (type <= MAX_ACE_TYPE) return (B_TRUE); } return (B_FALSE); @@ -629,18 +627,18 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, return (NULL); } -static uint64_t -zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, +static uintptr_t +zfs_ace_walk(void *datap, uintptr_t cookie, int aclcnt, uint16_t *flags, uint16_t *type, uint32_t *mask) { (void) aclcnt; zfs_acl_t *aclp = datap; - zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)cookie; uint64_t who; acep = zfs_acl_next_ace(aclp, acep, &who, mask, flags, type); - return ((uint64_t)(uintptr_t)acep); + return ((uintptr_t)acep); } /* @@ -1163,6 +1161,7 @@ zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, cb->cb_acl_node); } + ASSERT3P(cb->cb_acl_node, !=, NULL); *dataptr = cb->cb_acl_node->z_acldata; *length = cb->cb_acl_node->z_size; } @@ -1284,7 +1283,7 @@ acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) */ static int ace_trivial_common(void *acep, int aclcnt, - uint64_t (*walk)(void *, uint64_t, int aclcnt, + uintptr_t (*walk)(void *, uintptr_t, int, uint16_t *, uint16_t *, uint32_t *)) { uint16_t flags; @@ -1801,7 +1800,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp, */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zidmap_t *mnt_ns) { int error; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -1888,8 +1887,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, acl_ids->z_mode |= S_ISGID; } else { if ((acl_ids->z_mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) + secpolicy_vnode_setids_setgids(cr, gid, mnt_ns, + zfs_i_user_ns(ZTOI(dzp))) != 0) { acl_ids->z_mode &= ~S_ISGID; + } } if (acl_ids->z_aclp == NULL) { @@ -1920,8 +1921,8 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) trim = B_TRUE; - zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE, - trim, acl_ids->z_aclp); + zfs_acl_chmod(S_ISDIR(vap->va_mode), acl_ids->z_mode, + B_FALSE, trim, acl_ids->z_aclp); } } @@ -1977,7 +1978,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (mask == 0) return (SET_ERROR(ENOSYS)); - if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, + zfs_init_idmap))) return (error); mutex_enter(&zp->z_acl_lock); @@ -2136,7 +2138,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); - if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, + zfs_init_idmap))) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, @@ -2228,8 +2231,7 @@ static int zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && - (!Z_ISDEV(ZTOI(zp)->i_mode) || - (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { + (!Z_ISDEV(ZTOI(zp)->i_mode) || (v4_mode & WRITE_MASK_ATTRS))) { return (SET_ERROR(EROFS)); } @@ -2282,7 +2284,7 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) */ static int zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - boolean_t anyaccess, cred_t *cr) + boolean_t anyaccess, cred_t *cr, zidmap_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_acl_t *aclp; @@ -2298,7 +2300,13 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, uid_t gowner; uid_t fowner; - zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + if (mnt_ns) { + fowner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), + KUID_TO_SUID(ZTOI(zp)->i_uid)); + gowner = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), + KGID_TO_SGID(ZTOI(zp)->i_gid)); + } else + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); @@ -2409,7 +2417,8 @@ zfs_has_access(znode_t *zp, cred_t *cr) { uint32_t have = ACE_ALL_PERMS; - if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr, + zfs_init_idmap) != 0) { uid_t owner; owner = zfs_fuid_map_id(ZTOZSB(zp), @@ -2439,7 +2448,8 @@ zfs_has_access(znode_t *zp, cred_t *cr) * we want to avoid that here. */ static int -zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr) +zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr, + zidmap_t *mnt_ns) { int err, mask; int unmapped = 0; @@ -2452,8 +2462,9 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr) return (unmapped ? SET_ERROR(EPERM) : 0); } -#if defined(HAVE_IOPS_PERMISSION_USERNS) - err = generic_permission(cr->user_ns, ZTOI(zp), mask); +#if (defined(HAVE_IOPS_PERMISSION_USERNS) || \ + defined(HAVE_IOPS_PERMISSION_IDMAP)) + err = generic_permission(mnt_ns, ZTOI(zp), mask); #else err = generic_permission(ZTOI(zp), mask); #endif @@ -2468,7 +2479,7 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr) static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr, zidmap_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int err; @@ -2518,20 +2529,20 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, } if (zp->z_pflags & ZFS_ACL_TRIVIAL) - return (zfs_zaccess_trivial(zp, working_mode, cr)); + return (zfs_zaccess_trivial(zp, working_mode, cr, mnt_ns)); - return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr, mnt_ns)); } static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - cred_t *cr) + cred_t *cr, zidmap_t *mnt_ns) { if (*working_mode != ACE_WRITE_DATA) return (SET_ERROR(EACCES)); return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, - check_privs, B_FALSE, cr)); + check_privs, B_FALSE, cr, mnt_ns)); } int @@ -2566,7 +2577,6 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) } if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) { - owner = B_TRUE; if (zdp->z_mode & S_IXUSR) { mutex_exit(&zdp->z_acl_lock); return (0); @@ -2576,7 +2586,6 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) } } if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) { - groupmbr = B_TRUE; if (zdp->z_mode & S_IXGRP) { mutex_exit(&zdp->z_acl_lock); return (0); @@ -2596,9 +2605,11 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) slow: DTRACE_PROBE(zfs__fastpath__execute__access__miss); - ZFS_ENTER(ZTOZSB(zdp)); - error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); - ZFS_EXIT(ZTOZSB(zdp)); + if ((error = zfs_enter(ZTOZSB(zdp), FTAG)) != 0) + return (error); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, + zfs_init_idmap); + zfs_exit(ZTOZSB(zdp), FTAG); return (error); } @@ -2609,7 +2620,8 @@ slow: * can define any form of access. */ int -zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, + zidmap_t *mnt_ns) { uint32_t working_mode; int error; @@ -2648,8 +2660,10 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } } - owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), - cr, ZFS_OWNER); + owner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), + KUID_TO_SUID(ZTOI(zp)->i_uid)); + owner = zfs_fuid_map_id(ZTOZSB(zp), owner, cr, ZFS_OWNER); + /* * Map the bits required to the standard inode flags * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits @@ -2674,7 +2688,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) needed_bits |= S_IXUSR; if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, - &check_privs, skipaclchk, cr)) == 0) { + &check_privs, skipaclchk, cr, mnt_ns)) == 0) { if (is_attr) zrele(xzp); return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, @@ -2688,7 +2702,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } if (error && (flags & V_APPEND)) { - error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr, + mnt_ns); } if (error && check_privs) { @@ -2699,7 +2714,6 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) * read_acl/read_attributes */ - error = 0; ASSERT(working_mode != 0); if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && @@ -2755,20 +2769,22 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) * NFSv4-style ZFS ACL format and call zfs_zaccess() */ int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr, + zidmap_t *mnt_ns) { - return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr, + mnt_ns)); } /* * Access function for secpolicy_vnode_setattr */ int -zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +zfs_zaccess_unix(void *zp, int mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); - return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, zfs_init_idmap)); } /* See zfs_zaccess_delete() */ @@ -2845,7 +2861,7 @@ static const boolean_t zfs_write_implies_delete_child = B_TRUE; * zfs_write_implies_delete_child */ int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zidmap_t *mnt_ns) { uint32_t wanted_dirperms; uint32_t dzp_working_mode = 0; @@ -2872,7 +2888,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) * (This is part of why we're checking the target first.) */ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, - &zpcheck_privs, B_FALSE, cr); + &zpcheck_privs, B_FALSE, cr, mnt_ns); if (zp_error == EACCES) { /* We hit a DENY ACE. */ if (!zpcheck_privs) @@ -2894,7 +2910,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) if (zfs_write_implies_delete_child) wanted_dirperms |= ACE_WRITE_DATA; dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr, mnt_ns); if (dzp_error == EACCES) { /* We hit a DENY ACE. */ if (!dzpcheck_privs) @@ -2976,7 +2992,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) + znode_t *tzp, cred_t *cr, zidmap_t *mnt_ns) { int add_perm; int error; @@ -2998,21 +3014,21 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, * If that succeeds then check for add_file/add_subdir permissions */ - if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + if ((error = zfs_zaccess_delete(sdzp, szp, cr, mnt_ns))) return (error); /* * If we have a tzp, see if we can delete it? */ if (tzp) { - if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) + if ((error = zfs_zaccess_delete(tdzp, tzp, cr, mnt_ns))) return (error); } /* * Now check for add permissions */ - error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr, mnt_ns); return (error); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index aae19f6346fd..54ed70d0394f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -392,7 +392,20 @@ zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) zfsctl_snapshot_hold(se); rw_enter(&se->se_taskqid_lock, RW_WRITER); - ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); + /* + * If this condition happens, we managed to: + * - dispatch once + * - want to dispatch _again_ before it returned + * + * So let's just return - if that task fails at unmounting, + * we'll eventually dispatch again, and if it succeeds, + * no problem. + */ + if (se->se_taskqid != TASKQID_INVALID) { + rw_exit(&se->se_taskqid_lock); + zfsctl_snapshot_rele(se); + return; + } se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); rw_exit(&se->se_taskqid_lock); @@ -465,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip) */ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, - const struct file_operations *fops, const struct inode_operations *ops) + const struct file_operations *fops, const struct inode_operations *ops, + uint64_t creation) { - inode_timespec_t now; struct inode *ip; znode_t *zp; + inode_timespec_t now = {.tv_sec = creation}; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); - now = current_time(ip); + if (!creation) + now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); @@ -485,9 +500,10 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; zp->z_is_sa = B_FALSE; +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; +#endif zp->z_is_ctldir = B_TRUE; - zp->z_is_stale = B_FALSE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; @@ -504,9 +520,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, ip->i_uid = SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; - ip->i_atime = now; - ip->i_mtime = now; - ip->i_ctime = now; + zpl_inode_set_atime_to_ts(ip, now); + zpl_inode_set_mtime_to_ts(ip, now); + zpl_inode_set_ctime_to_ts(ip, now); ip->i_fop = fops; ip->i_op = ops; #if defined(IOP_XATTR) @@ -521,7 +537,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); @@ -538,14 +553,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; + uint64_t creation = 0; + dsl_dataset_t *snap_ds; + dsl_pool_t *pool; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; + if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { + pool = dmu_objset_pool(zfsvfs->z_os); + dsl_pool_config_enter(pool, FTAG); + if (!dsl_dataset_hold_obj(pool, + ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { + creation = dsl_get_creation(snap_ds); + dsl_dataset_rele(snap_ds, FTAG); + } + dsl_pool_config_exit(pool, FTAG); + } + /* May fail due to concurrent zfsctl_inode_alloc() */ - ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); + ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); } return (ip); @@ -567,7 +596,7 @@ zfsctl_create(zfsvfs_t *zfsvfs) ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, - &zpl_fops_root, &zpl_ops_root); + &zpl_fops_root, &zpl_ops_root, 0); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); @@ -673,17 +702,19 @@ zfsctl_fid(struct inode *ip, fid_t *fidp) uint64_t object = zp->z_id; zfid_short_t *zfid; int i; + int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (zfsctl_is_snapdir(ip)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (zfsctl_snapdir_fid(ip, fidp)); } if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } @@ -698,7 +729,7 @@ zfsctl_fid(struct inode *ip, fid_t *fidp) for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -776,7 +807,8 @@ zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, zfsvfs_t *zfsvfs = ITOZSB(dip); int error = 0; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (strcmp(name, "..") == 0) { *ipp = dip->i_sb->s_root->d_inode; @@ -793,7 +825,7 @@ zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, if (*ipp == NULL) error = SET_ERROR(ENOENT); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -810,11 +842,12 @@ zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, uint64_t id; int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); if (error) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -823,7 +856,7 @@ zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, if (*ipp == NULL) error = SET_ERROR(ENOENT); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -844,7 +877,8 @@ zfsctl_snapdir_rename(struct inode *sdip, const char *snm, if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -902,7 +936,7 @@ out: kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -922,7 +956,8 @@ zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr, if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -951,7 +986,7 @@ out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1076,7 +1111,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) return (SET_ERROR(EISDIR)); zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); @@ -1164,7 +1200,7 @@ error: kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1228,10 +1264,11 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, znode_t *dzp; int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (zfsvfs->z_shares_dir == 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } @@ -1240,7 +1277,7 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, zrele(dzp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c index be65f0a2e245..f707959c9445 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -29,13 +29,13 @@ typedef struct zfs_dbgmsg { procfs_list_node_t zdm_node; uint64_t zdm_timestamp; - int zdm_size; - char zdm_msg[1]; /* variable length allocation */ + uint_t zdm_size; + char zdm_msg[]; /* variable length allocation */ } zfs_dbgmsg_t; static procfs_list_t zfs_dbgmsgs; -static int zfs_dbgmsg_size = 0; -int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ +static uint_t zfs_dbgmsg_size = 0; +static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ /* * Internal ZFS debug messages are enabled by default. @@ -68,14 +68,14 @@ zfs_dbgmsg_show(struct seq_file *f, void *p) } static void -zfs_dbgmsg_purge(int max_size) +zfs_dbgmsg_purge(uint_t max_size) { while (zfs_dbgmsg_size > max_size) { zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list); if (zdm == NULL) return; - int size = zdm->zdm_size; + uint_t size = zdm->zdm_size; kmem_free(zdm, size); zfs_dbgmsg_size -= size; } @@ -135,7 +135,7 @@ __set_error(const char *file, const char *func, int line, int err) void __zfs_dbgmsg(char *buf) { - int size = sizeof (zfs_dbgmsg_t) + strlen(buf); + uint_t size = sizeof (zfs_dbgmsg_t) + strlen(buf) + 1; zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); zdm->zdm_size = size; zdm->zdm_timestamp = gethrestime_sec(); @@ -144,7 +144,7 @@ __zfs_dbgmsg(char *buf) mutex_enter(&zfs_dbgmsgs.pl_lock); procfs_list_add(&zfs_dbgmsgs, zdm); zfs_dbgmsg_size += size; - zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + zfs_dbgmsg_purge(zfs_dbgmsg_maxsize); mutex_exit(&zfs_dbgmsgs.pl_lock); } @@ -175,7 +175,8 @@ __dprintf(boolean_t dprint, const char *file, const char *func, newfile = file; } - i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func); + i = snprintf(buf, size, "%px %s%s:%d:%s(): ", + curthread, prefix, newfile, line, func); if (i < size) { va_start(adx, fmt); @@ -252,6 +253,8 @@ zfs_dbgmsg_print(const char *tag) module_param(zfs_dbgmsg_enable, int, 0644); MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); -module_param(zfs_dbgmsg_maxsize, int, 0644); +/* BEGIN CSTYLED */ +module_param(zfs_dbgmsg_maxsize, uint, 0644); +/* END CSTYLED */ MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c index c5b3b5ce7fc0..1eeabe53d23c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -649,6 +649,8 @@ zfs_rmnode(znode_t *zp) objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; dmu_tx_t *tx; + znode_hold_t *zh; + uint64_t z_id = zp->z_id; uint64_t acl_obj; uint64_t xattr_obj; uint64_t links; @@ -666,8 +668,9 @@ zfs_rmnode(znode_t *zp) * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ + zh = zfs_znode_hold_enter(zfsvfs, z_id); zfs_znode_dmu_fini(zp); - + zfs_znode_hold_exit(zfsvfs, zh); return; } } @@ -686,7 +689,9 @@ zfs_rmnode(znode_t *zp) * Not enough space or we were interrupted by unmount. * Leave the file in the unlinked set. */ + zh = zfs_znode_hold_enter(zfsvfs, z_id); zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); return; } } @@ -726,7 +731,9 @@ zfs_rmnode(znode_t *zp) * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); + zh = zfs_znode_hold_enter(zfsvfs, z_id); zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); goto out; } @@ -926,6 +933,74 @@ zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, return (error); } +static int +zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + uint64_t links; + int count = 0; + int error; + + if (zp_is_dir && !zfs_dirempty(zp)) + return (SET_ERROR(ENOTEMPTY)); + + if (ZTOI(zp)->i_nlink <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %lu is %u, " + "should be at least %u", zp->z_id, + (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); + set_nlink(ZTOI(zp), zp_is_dir + 1); + } + drop_nlink(ZTOI(zp)); + if (ZTOI(zp)->i_nlink == zp_is_dir) { + zp->z_unlinked = B_TRUE; + clear_nlink(ZTOI(zp)); + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + links = ZTOI(zp)->i_nlink; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &links, sizeof (links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT3U(error, ==, 0); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Forcefully drop an nlink reference from (zp) and mark it for deletion if it + * was the last link. This *must* only be done to znodes which have already + * been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in + * the error path of zfs_rename(), where we have to correct the nlink count if + * we failed to link the target as well as failing to re-link the original + * znodes. + */ +int +zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) +{ + int error; + + mutex_enter(&zp->z_lock); + error = zfs_drop_nlink_locked(zp, tx, unlinkedp); + mutex_exit(&zp->z_lock); + + return (error); +} + /* * Unlink zp from dl, and mark zp for deletion if this was the last link. Can * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). @@ -966,31 +1041,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, return (error); } - if (ZTOI(zp)->i_nlink <= zp_is_dir) { - zfs_panic_recover("zfs: link count on %lu is %u, " - "should be at least %u", zp->z_id, - (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); - set_nlink(ZTOI(zp), zp_is_dir + 1); - } - drop_nlink(ZTOI(zp)); - if (ZTOI(zp)->i_nlink == zp_is_dir) { - zp->z_unlinked = B_TRUE; - clear_nlink(ZTOI(zp)); - unlinked = B_TRUE; - } else { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, sizeof (zp->z_pflags)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, - ctime); - } - links = ZTOI(zp)->i_nlink; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &links, sizeof (links)); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - count = 0; - ASSERT(error == 0); + /* The only error is !zfs_dirempty() and we checked earlier. */ + error = zfs_drop_nlink_locked(zp, tx, &unlinked); + ASSERT3U(error, ==, 0); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); @@ -1066,11 +1119,8 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) *xzpp = NULL; - if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) - return (error); - if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, - &acl_ids)) != 0) + &acl_ids, zfs_init_idmap)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { zfs_acl_ids_free(&acl_ids); @@ -1218,7 +1268,8 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || - zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0) + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, + zfs_init_idmap) == 0) return (0); else return (secpolicy_vnode_remove(cr)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c index e12f7c3ced43..bc753614be27 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -246,7 +246,7 @@ zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { loff_t rc; - if (*offp < 0 || *offp > MAXOFFSET_T) + if (*offp < 0) return (EINVAL); rc = vfs_llseek(fp, *offp, whence); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c index 67b864aa77a9..663474ea49ab 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -135,7 +135,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) vecnum = cmd - ZFS_IOC_FIRST; - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + zc = vmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) { error = -SET_ERROR(EFAULT); @@ -146,7 +146,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) if (error == 0 && rc != 0) error = -SET_ERROR(EFAULT); out: - kmem_free(zc, sizeof (zfs_cmd_t)); + vmem_free(zc, sizeof (zfs_cmd_t)); return (error); } @@ -282,6 +282,8 @@ zfsdev_detach(void) #define ZFS_DEBUG_STR "" #endif +zidmap_t *zfs_init_idmap; + static int openzfs_init_os(void) { @@ -305,6 +307,8 @@ openzfs_init_os(void) printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n"); #endif /* CONFIG_FS_POSIX_ACL */ + zfs_init_idmap = (zidmap_t *)zfs_get_init_idmap(); + return (0); } @@ -369,8 +373,7 @@ MODULE_ALIAS("zcommon"); MODULE_ALIAS("zzstd"); MODULE_DESCRIPTION("ZFS"); MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE("Lua: MIT"); -MODULE_LICENSE("zstd: Dual BSD/GPL"); -MODULE_LICENSE("Dual BSD/GPL"); +MODULE_LICENSE("Dual MIT/GPL"); /* lua */ +MODULE_LICENSE("Dual BSD/GPL"); /* zstd / misc */ MODULE_LICENSE(ZFS_META_LICENSE); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c index eb7c5f6166d2..e2431fe8a803 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -279,11 +279,11 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, for (int i = 0; i < ARRAY_SIZE(type_map); i++) { if (type_map[i].ztm_type & property->pd_types) { - len += snprintf(buf + len, buflen - len, "%s ", - type_map[i].ztm_name); + len += kmem_scnprintf(buf + len, buflen - len, + "%s ", type_map[i].ztm_name); } } - len += snprintf(buf + len, buflen - len, "\n"); + len += kmem_scnprintf(buf + len, buflen - len, "\n"); return (len); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c index abb6dbe67cdf..c2ed67c438c6 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -204,22 +204,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) this_seg_start = orig_loffset; rq_for_each_segment(bv, rq, iter) { - if (uio->iter.bio) { - /* - * If uio->iter.bio is present, then we know we've saved - * uio->iter from a previous call to this function, and - * we can skip ahead in this rq_for_each_segment() loop - * to where we last left off. That way, we don't need - * to iterate over tons of segments we've already - * processed - we can just restore the "saved state". - */ - iter = uio->iter; - bv = uio->bv; - this_seg_start = uio->uio_loffset; - memset(&uio->iter, 0, sizeof (uio->iter)); - continue; - } - /* * Lookup what the logical offset of the last byte of this * segment is. @@ -260,19 +244,6 @@ zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) copied = 1; /* We copied some data */ } - if (n == 0) { - /* - * All done copying. Save our 'iter' value to the uio. - * This allows us to "save our state" and skip ahead in - * the rq_for_each_segment() loop the next time we call - * call zfs_uiomove_bvec_rq() on this uio (which we - * will be doing for any remaining data in the uio). - */ - uio->iter = iter; /* make a copy of the struct data */ - uio->bv = bv; - return (0); - } - this_seg_start = this_seg_end + 1; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index a67ba821d06f..2015c20d7340 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -56,7 +56,6 @@ #include <sys/sunddi.h> #include <sys/dmu_objset.h> #include <sys/dsl_dir.h> -#include <sys/spa_boot.h> #include <sys/objlist.h> #include <sys/zpl.h> #include <linux/vfs_compat.h> @@ -274,8 +273,10 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) * Sync a specific filesystem. */ dsl_pool_t *dp; + int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* @@ -283,14 +284,14 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) * filesystems which may exist on a suspended pool. */ if (spa_suspended(dp->dp_spa)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. This is what happens when you @@ -607,7 +608,8 @@ zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, } if (tmp != *val) { - (void) strcpy(setpoint, "temporary"); + if (setpoint) + (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); @@ -783,9 +785,7 @@ zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) } error = zfsvfs_create_impl(zfvp, zfsvfs, os); - if (error != 0) { - dmu_objset_disown(os, B_TRUE, zfsvfs); - } + return (error); } @@ -825,6 +825,7 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) error = zfsvfs_init(zfsvfs, os); if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; zfsvfs_free(zfsvfs); return (error); @@ -848,8 +849,6 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) if (error) return (error); - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all @@ -857,7 +856,11 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) */ if (mounting) { ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); - dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); + error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); + if (error) + return (error); + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, + &zfsvfs->z_kstat.dk_zil_sums); /* * During replay we remove the read only flag to @@ -921,6 +924,10 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) /* restore readonly bit */ if (readonly != 0) readonly_changed_cb(zfsvfs, B_TRUE); + } else { + ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, + &zfsvfs->z_kstat.dk_zil_sums); } /* @@ -1087,7 +1094,8 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp) uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); @@ -1148,7 +1156,7 @@ zfs_statvfs(struct inode *ip, struct kstatfs *statp) err = zfs_statfs_project(zfsvfs, zp, statp, bshift); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -1158,13 +1166,14 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) znode_t *rootzp; int error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *ipp = ZTOI(rootzp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1185,7 +1194,7 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) int objects = 0; int i = 0, j = 0; - zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); mutex_enter(&zfsvfs->z_znodes_lock); while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { @@ -1221,7 +1230,7 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) zrele(zp); } - kmem_free(zp_array, max_array * sizeof (znode_t *)); + vmem_free(zp_array, max_array * sizeof (znode_t *)); return (objects); } @@ -1231,23 +1240,30 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. */ +#if defined(HAVE_SUPER_BLOCK_S_SHRINK) +#define S_SHRINK(sb) (&(sb)->s_shrink) +#elif defined(HAVE_SUPER_BLOCK_S_SHRINK_PTR) +#define S_SHRINK(sb) ((sb)->s_shrink) +#endif + int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; - struct shrinker *shrinker = &sb->s_shrink; + struct shrinker *shrinker = S_SHRINK(sb); struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, }; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) - if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { + if (shrinker->flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); @@ -1283,7 +1299,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #endif - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "pruning, nr_to_scan=%lu objects=%d error=%d\n", @@ -1320,12 +1336,11 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * - * We can safely read z_nr_znodes without locking because the - * VFS has already blocked operations which add to the - * z_all_znodes list and thus increment z_nr_znodes. + * We can safely check z_all_znodes for being empty because the + * VFS has already blocked operations which add to it. */ int round = 0; - while (zfsvfs->z_nr_znodes > 0) { + while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) @@ -1479,7 +1494,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) * read-only flag, pretend it was set, as done for snapshots. */ if (!canwrite) - vfs->vfs_readonly = true; + vfs->vfs_readonly = B_TRUE; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { @@ -1513,7 +1528,6 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; - sb->s_d_op = &zpl_dentry_operations; /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); @@ -1547,6 +1561,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) error = zfs_root(zfsvfs, &root_inode); if (error) { (void) zfs_umount(sb); + zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ goto out; } @@ -1554,6 +1569,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) { (void) zfs_umount(sb); + zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ error = SET_ERROR(ENOMEM); goto out; } @@ -1651,6 +1667,7 @@ zfs_umount(struct super_block *sb) } zfsvfs_free(zfsvfs); + sb->s_fs_info = NULL; return (0); } @@ -1740,7 +1757,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); } - ZFS_ENTER(zfsvfs); + if ((err = zfs_enter(zfsvfs, FTAG)) != 0) + return (err); /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { @@ -1756,7 +1774,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) */ VERIFY3P(igrab(*ipp), !=, NULL); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1764,14 +1782,14 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } /* Don't export xattr stuff */ if (zp->z_pflags & ZFS_XATTR) { zrele(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } @@ -1786,7 +1804,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); zrele(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } @@ -1794,7 +1812,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) if (*ipp) zfs_znode_update_vfs(ITOZ(*ipp)); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1869,8 +1887,8 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) zp = list_next(&zfsvfs->z_all_znodes, zp)) { err2 = zfs_rezget(zp); if (err2) { + zpl_d_drop_aliases(ZTOI(zp)); remove_inode_hash(ZTOI(zp)); - zp->z_is_stale = B_TRUE; } /* see comment in zfs_suspend_fs() */ @@ -2041,91 +2059,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) } /* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) - pname = ZPL_VERSION_STR; - else - pname = zfs_prop_to_name(prop); - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_OFF; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - -/* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. @@ -2164,6 +2097,9 @@ zfs_init(void) zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + register_fo_extend(&zpl_file_operations); +#endif } void @@ -2174,6 +2110,9 @@ zfs_fini(void) */ taskq_wait(system_delay_taskq); taskq_wait(system_taskq); +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + unregister_fo_extend(&zpl_file_operations); +#endif unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index d6ff838806eb..1cecad9f7755 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -82,13 +82,13 @@ * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. - * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * This is done avoiding races using zfs_enter(zfsvfs). + * A zfs_exit(zfsvfs) is needed before all returns. Any znodes + * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if - * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the + * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires @@ -107,7 +107,7 @@ * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * - * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). @@ -142,7 +142,7 @@ * * In general, this is how things should be ordered in each vnode op: * - * ZFS_ENTER(zfsvfs); // exit if unmounted + * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need @@ -160,7 +160,7 @@ * goto top; * } * dmu_tx_abort(tx); // abort DMU tx - * ZFS_EXIT(zfsvfs); // finished in zfs + * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does @@ -171,7 +171,7 @@ * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary - * ZFS_EXIT(zfsvfs); // finished in zfs + * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ int @@ -180,22 +180,29 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); /* Honor ZFS_APPENDONLY file attribute */ - if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } - /* Keep a count of the synchronous opens in the znode */ - if (flag & O_SYNC) - atomic_inc_32(&zp->z_sync_cnt); + /* + * Keep a count of the synchronous opens in the znode. On first + * synchronous open we must convert all previous async transactions + * into sync to keep correct ordering. + */ + if (flag & O_SYNC) { + if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) + zil_async_to_sync(zfsvfs->z_log, zp->z_id); + } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -205,56 +212,60 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) + +static int zfs_fillpage(struct inode *ip, struct page *pp); + /* * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Write: If we find a memory mapped page, we write to *both* - * the page and the dmu buffer. + * between the DMU cache and the memory mapped pages. Update all mapped + * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { - struct inode *ip = ZTOI(zp); - struct address_space *mp = ip->i_mapping; - struct page *pp; - uint64_t nbytes; - int64_t off; - void *pb; + struct address_space *mp = ZTOI(zp)->i_mapping; + int64_t off = start & (PAGE_SIZE - 1); - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - nbytes = MIN(PAGE_SIZE - off, len); + uint64_t nbytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); - pb = kmap(pp); - (void) dmu_read(os, zp->z_id, start + off, nbytes, - pb + off, DMU_READ_PREFETCH); + void *pb = kmap(pp); + int error = dmu_read(os, zp->z_id, start + off, + nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); - if (mapping_writably_mapped(mp)) - flush_dcache_page(pp); + if (error) { + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + } - mark_page_accessed(pp); - SetPageUptodate(pp); - ClearPageError(pp); unlock_page(pp); put_page(pp); } @@ -265,38 +276,44 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) } /* - * When a file is memory mapped, we must keep the IO data synchronized - * between the DMU cache and the memory mapped pages. What this means: - * - * On Read: We "read" preferentially from memory mapped pages, - * else we default from the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. + * When a file is memory mapped, we must keep the I/O data synchronized + * between the DMU cache and the memory mapped pages. Preferentially read + * from memory mapped pages, otherwise fallback to reading through the dmu. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; - struct page *pp; - int64_t start, off; - uint64_t bytes; + int64_t start = uio->uio_loffset; + int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; - void *pb; - start = uio->uio_loffset; - off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { - bytes = MIN(PAGE_SIZE - off, len); + uint64_t bytes = MIN(PAGE_SIZE - off, len); - pp = find_lock_page(mp, start >> PAGE_SHIFT); + struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { - ASSERT(PageUptodate(pp)); + /* + * If filemap_fault() retries there exists a window + * where the page will be unlocked and not up to date. + * In this case we must try and fill the page. + */ + if (unlikely(!PageUptodate(pp))) { + error = zfs_fillpage(ip, pp); + if (error) { + unlock_page(pp); + put_page(pp); + return (error); + } + } + + ASSERT(PageUptodate(pp) || PageDirty(pp)); + unlock_page(pp); - pb = kmap(pp); + void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); @@ -312,9 +329,11 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) len -= bytes; off = 0; + if (error) break; } + return (error); } #endif /* _KERNEL */ @@ -449,8 +468,8 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, } } - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zdp); + if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) + return (error); *zpp = NULL; @@ -460,12 +479,12 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -474,17 +493,17 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, - B_TRUE, cr))) { + B_TRUE, cr, zfs_init_idmap))) { zrele(*zpp); *zpp = NULL; } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTDIR)); } @@ -492,14 +511,15 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, * Check accessibility of directory. */ - if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, + zfs_init_idmap))) { + zfs_exit(zfsvfs, FTAG); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } @@ -507,7 +527,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -524,6 +544,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set + * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created or trunc'd entry. * @@ -535,7 +556,8 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, - int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, + zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -550,6 +572,7 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; + boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then @@ -566,21 +589,21 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -609,7 +632,7 @@ top: zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -622,7 +645,8 @@ top: * Create a new file object and update the directory * to reference it. */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, + mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; @@ -641,7 +665,7 @@ top: } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) + cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; @@ -681,7 +705,7 @@ top: } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); @@ -714,7 +738,6 @@ top: if (have_acl) zfs_acl_ids_free(&acl_ids); - have_acl = B_FALSE; /* * A directory entry already exists for this name. @@ -736,7 +759,8 @@ top: /* * Verify requested access to file. */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, + mnt_ns))) { goto out; } @@ -774,13 +798,14 @@ out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, + zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); @@ -808,14 +833,14 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } @@ -827,14 +852,14 @@ top: * Create a new file object and update the directory * to reference it. */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) + cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; @@ -870,7 +895,7 @@ top: } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); @@ -894,7 +919,7 @@ out: *ipp = ZTOI(zp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -941,8 +966,8 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { @@ -961,11 +986,11 @@ top: NULL, realnmp))) { if (realnmp) pn_free(realnmp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } @@ -979,7 +1004,7 @@ top: mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped); + !zn_has_cached_data(zp, 0, LLONG_MAX); mutex_exit(&zp->z_lock); /* @@ -1042,7 +1067,7 @@ top: zrele(zp); if (xzp) zrele(xzp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1067,8 +1092,10 @@ top: &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && - !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && + !zn_has_cached_data(zp, 0, LLONG_MAX) && + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; + VERIFY_IMPLY(xattr_obj_unlinked, xzp); } if (delete_now) { @@ -1131,7 +1158,7 @@ out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1145,6 +1172,7 @@ out: * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set + * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created directory. * @@ -1157,7 +1185,7 @@ out: */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, - cred_t *cr, int flags, vsecattr_t *vsecp) + cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -1188,18 +1216,18 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, if (dirname == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) @@ -1208,14 +1236,14 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - vsecp, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); + vsecp, &acl_ids, mnt_ns)) != 0) { + zfs_exit(zfsvfs, FTAG); return (error); } /* @@ -1231,21 +1259,22 @@ top: if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, + mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } @@ -1277,7 +1306,7 @@ top: } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1323,7 +1352,7 @@ out: zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1359,8 +1388,8 @@ zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) @@ -1373,11 +1402,11 @@ top: */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } @@ -1424,7 +1453,7 @@ top: } dmu_tx_abort(tx); zrele(zp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1452,7 +1481,7 @@ out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1491,8 +1520,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) @@ -1587,11 +1616,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (done) break; - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } + if (prefetch) + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. @@ -1611,7 +1637,7 @@ update: if (error == ENOENT) error = 0; out: - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -1629,20 +1655,29 @@ out: * RETURN: 0 (always succeeds) */ int -zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK +zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, struct kstat *sp) +#else +zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) +#endif { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); mutex_enter(&zp->z_lock); +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + zpl_generic_fillattr(user_ns, request_mask, ip, sp); +#else zpl_generic_fillattr(user_ns, ip, sp); +#endif /* * +1 link count for root inode with visible '.zfs' directory. */ @@ -1673,7 +1708,7 @@ zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, dmu_objset_id(zfsvfs->z_os); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -1808,6 +1843,7 @@ next: * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. + * mnt_ns - user namespace of the mount * * RETURN: 0 if success * error code if failure @@ -1816,11 +1852,11 @@ next: * ip - ctime updated, mtime updated if size changed. */ int -zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); - objset_t *os = zfsvfs->z_os; + objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; @@ -1849,9 +1885,10 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if (mask == 0) return (0); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (err); ip = ZTOI(zp); + os = zfsvfs->z_os; /* * If this is a xvattr_t, then get a pointer to the structure of @@ -1862,13 +1899,13 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -1883,7 +1920,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } } @@ -1899,17 +1936,17 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -1965,7 +2002,8 @@ top: */ if (mask & ATTR_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, + mnt_ns); if (err) goto out3; @@ -1990,13 +2028,15 @@ top: XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); + skipaclchk, cr, mnt_ns); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; + uid_t uid; + gid_t gid; /* * NOTE: even if a new mode is being set, @@ -2010,9 +2050,13 @@ top: * Take ownership or chgrp to group we are a member of */ - take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), + vap->va_uid); + gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), + vap->va_gid); + take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); + zfs_groupmember(zfsvfs, gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and @@ -2028,7 +2072,7 @@ top: ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { + skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ @@ -2141,12 +2185,12 @@ top: mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, + mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, - &oldva, cr); + &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); if (err) goto out3; - trim_mask |= ATTR_MODE; } else { need_policy = TRUE; @@ -2167,7 +2211,7 @@ top: vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, - (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + zfs_zaccess_unix, zp); if (err) goto out3; @@ -2395,15 +2439,16 @@ top: if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( - vap->va_mtime, ZTOI(zp)); + zpl_inode_set_mtime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); @@ -2411,8 +2456,8 @@ top: if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); - ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, - ZTOI(zp)); + zpl_inode_set_ctime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } @@ -2512,7 +2557,7 @@ out: dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) - err2 = zfs_setattr_dir(attrzp); + err = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); @@ -2526,7 +2571,7 @@ out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -2637,6 +2682,9 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * tnm - New entry name. * cr - credentials of caller. * flags - case flags + * rflags - RENAME_* flags + * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). + * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. * @@ -2645,7 +2693,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, - cred_t *cr, int flags) + cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); @@ -2657,15 +2705,41 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, int error = 0; int zflg = 0; boolean_t waited = B_FALSE; + /* Needed for whiteout inode creation. */ + boolean_t fuid_dirtied; + zfs_acl_ids_t acl_ids; + boolean_t have_acl = B_FALSE; + znode_t *wzp = NULL; + if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); + if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return (SET_ERROR(EINVAL)); + + /* Already checked by Linux VFS, but just to make sure. */ + if (rflags & RENAME_EXCHANGE && + (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) + return (SET_ERROR(EINVAL)); + + /* + * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the + * right kind of vattr_t for the whiteout file. These are set + * internally by ZFS so should never be incorrect. + */ + VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); + VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); + VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); + + if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; - ZFS_VERIFY_ZP(tdzp); + if ((error = zfs_verify_zp(tdzp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * We check i_sb because snapshots and the ctldir must have different @@ -2673,13 +2747,13 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } @@ -2697,7 +2771,7 @@ top: * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -2727,7 +2801,7 @@ top: * the rename() function shall return successfully * and perform no other action." */ - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } /* @@ -2799,7 +2873,7 @@ top: if (strcmp(snm, "..") == 0) serr = EINVAL; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (serr); } if (terr) { @@ -2811,7 +2885,7 @@ top: if (strcmp(tnm, "..") == 0) terr = EINVAL; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (terr); } @@ -2834,8 +2908,7 @@ top: * Note that if target and source are the same, this can be * done in a single check. */ - - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { @@ -2851,17 +2924,19 @@ top: * Does target exist? */ if (tzp) { + if (rflags & RENAME_NOREPLACE) { + error = SET_ERROR(EEXIST); + goto out; + } /* - * Source and target must be the same type. + * Source and target must be the same type (unless exchanging). */ - if (S_ISDIR(ZTOI(szp)->i_mode)) { - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - } else { - if (S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(EISDIR); + if (!(rflags & RENAME_EXCHANGE)) { + boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; + boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; + + if (s_is_dir != t_is_dir) { + error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); goto out; } } @@ -2874,12 +2949,43 @@ top: error = 0; goto out; } + } else if (rflags & RENAME_EXCHANGE) { + /* Target must exist for RENAME_EXCHANGE. */ + error = SET_ERROR(ENOENT); + goto out; + } + + /* Set up inode creation for RENAME_WHITEOUT. */ + if (rflags & RENAME_WHITEOUT) { + /* + * Whiteout files are not regular files or directories, so to + * match zfs_create() we do not inherit the project id. + */ + uint64_t wo_projid = ZFS_DEFAULT_PROJID; + + error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); + if (error) + goto out; + + if (!have_acl) { + error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, + &acl_ids, mnt_ns); + if (error) + goto out; + have_acl = B_TRUE; + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { + error = SET_ERROR(EDQUOT); + goto out; + } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, sdzp->z_id, + (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); @@ -2889,7 +2995,21 @@ top: dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } + if (rflags & RENAME_WHITEOUT) { + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); @@ -2915,62 +3035,114 @@ top: zrele(szp); if (tzp) zrele(tzp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + /* + * Unlink the source. + */ + szp->z_pflags |= ZFS_AV_MODIFIED; + if (tdzp->z_pflags & ZFS_PROJINHERIT) + szp->z_pflags |= ZFS_PROJINHERIT; - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - if (tdzp->z_pflags & ZFS_PROJINHERIT) - szp->z_pflags |= ZFS_PROJINHERIT; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + VERIFY0(error); + + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error) + goto commit; + + /* + * Unlink the target. + */ + if (tzp) { + int tzflg = zflg; + + if (rflags & RENAME_EXCHANGE) { + /* This inode will be re-linked soon. */ + tzflg |= ZRENAMING; + + tzp->z_pflags |= ZFS_AV_MODIFIED; + if (sdzp->z_pflags & ZFS_PROJINHERIT) + tzp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&tzp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); + } + error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); + if (error) + goto commit_link_szp; + } - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. - */ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } else { - /* - * If we had removed the existing target, subsequent - * call to zfs_link_create() to add back the same entry - * but, the new dnode (szp) should not fail. - */ - ASSERT(tzp == NULL); + /* + * Create the new target links: + * * We always link the target. + * * RENAME_EXCHANGE: Link the old target to the source. + * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. + */ + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error) { + /* + * If we have removed the existing target, a subsequent call to + * zfs_link_create() to add back the same entry, but with a new + * dnode (szp), should not fail. + */ + ASSERT3P(tzp, ==, NULL); + goto commit_link_tzp; + } + + switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { + case RENAME_EXCHANGE: + error = zfs_link_create(sdl, tzp, tx, ZRENAMING); + /* + * The same argument as zfs_link_create() failing for + * szp applies here, since the source directory must + * have had an entry we are replacing. + */ + ASSERT0(error); + if (error) + goto commit_unlink_td_szp; + break; + case RENAME_WHITEOUT: + zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); + error = zfs_link_create(sdl, wzp, tx, ZNEW); + if (error) { + zfs_znode_delete(wzp, tx); + remove_inode_hash(ZTOI(wzp)); + goto commit_unlink_td_szp; } + break; } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { + case RENAME_EXCHANGE: + zfs_log_rename_exchange(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, + tdzp, tdl->dl_name, szp); + break; + case RENAME_WHITEOUT: + zfs_log_rename_whiteout(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, + tdzp, tdl->dl_name, szp, wzp); + break; + default: + ASSERT0(rflags & ~RENAME_NOREPLACE); + zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), + sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + break; + } + +commit: dmu_tx_commit(tx); out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); + if (have_acl) + zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) @@ -2981,16 +3153,57 @@ out: zfs_znode_update_vfs(szp); zrele(szp); + if (wzp) { + zfs_znode_update_vfs(wzp); + zrele(wzp); + } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); + + /* + * Clean-up path for broken link state. + * + * At this point we are in a (very) bad state, so we need to do our + * best to correct the state. In particular, all of the nlinks are + * wrong because we were destroying and creating links with ZRENAMING. + * + * In some form, all of these operations have to resolve the state: + * + * * link_destroy() *must* succeed. Fortunately, this is very likely + * since we only just created it. + * + * * link_create()s are allowed to fail (though they shouldn't because + * we only just unlinked them and are putting the entries back + * during clean-up). But if they fail, we can just forcefully drop + * the nlink value to (at the very least) avoid broken nlink values + * -- though in the case of non-empty directories we will have to + * panic (otherwise we'd have a leaked directory with a broken ..). + */ +commit_unlink_td_szp: + VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); +commit_link_tzp: + if (tzp) { + if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) + VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); + } +commit_link_szp: + if (zfs_link_create(sdl, szp, tx, ZRENAMING)) + VERIFY0(zfs_drop_nlink(szp, tx, NULL)); + goto commit; } /* @@ -3002,6 +3215,7 @@ out: * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags + * mnt_ns - user namespace of the mount * * OUT: zpp - Znode for new symbolic link. * @@ -3012,7 +3226,7 @@ out: */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, - znode_t **zpp, cred_t *cr, int flags) + znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; @@ -3032,26 +3246,26 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(dzp); + if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { - ZFS_EXIT(zfsvfs); + vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { + zfs_exit(zfsvfs, FTAG); return (error); } top: @@ -3063,21 +3277,21 @@ top: error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); @@ -3104,7 +3318,7 @@ top: } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3159,7 +3373,7 @@ top: zrele(zp); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3185,8 +3399,8 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) zfsvfs_t *zfsvfs = ITOZSB(ip); int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); mutex_enter(&zp->z_lock); if (zp->z_is_sa) @@ -3196,7 +3410,7 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3241,8 +3455,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, if (name == NULL) return (SET_ERROR(EINVAL)); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(tdzp); + if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) + return (error); zilog = zfsvfs->z_log; /* @@ -3250,11 +3464,14 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } - ZFS_VERIFY_ZP(szp); + if ((error = zfs_verify_zp(szp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * If we are using project inheritance, means if the directory has @@ -3265,7 +3482,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } @@ -3274,7 +3491,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } @@ -3282,17 +3499,17 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) @@ -3305,19 +3522,20 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } - if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, + zfs_init_idmap))) { + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3327,7 +3545,7 @@ top: */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3349,7 +3567,7 @@ top: goto top; } dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ @@ -3391,7 +3609,7 @@ top: zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3444,12 +3662,13 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (err); ASSERT(PageLocked(pp)); @@ -3461,7 +3680,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3521,7 +3740,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3549,7 +3768,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, #endif } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3557,7 +3776,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3576,11 +3795,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_NOWAIT); + err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { - if (err == ERESTART) - dmu_tx_wait(tx); - dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); @@ -3592,7 +3808,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, if (!for_sync) atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -3607,28 +3823,23 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, - for_sync ? zfs_putpage_sync_commit_cb : - zfs_putpage_async_commit_cb, pp); - - dmu_tx_commit(tx); - - zfs_rangelock_exit(lr); - + boolean_t commit = B_FALSE; if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ - zil_commit(zfsvfs->z_log, zp->z_id); + commit = B_TRUE; } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { /* * If the caller does not intend to wait synchronously @@ -3638,12 +3849,23 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, * our writeback to complete. Refer to the comment in * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. */ - zil_commit(zfsvfs->z_log, zp->z_id); + commit = B_TRUE; } + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, + for_sync ? zfs_putpage_sync_commit_cb : + zfs_putpage_async_commit_cb, pp); + + dmu_tx_commit(tx); + + zfs_rangelock_exit(lr); + + if (commit) + zil_commit(zfsvfs->z_log, zp->z_id); + dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (err); } @@ -3658,6 +3880,7 @@ zfs_dirty_inode(struct inode *ip, int flags) zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; @@ -3665,8 +3888,8 @@ zfs_dirty_inode(struct inode *ip, int flags) if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); #ifdef I_DIRTY_TIME /* @@ -3702,9 +3925,12 @@ zfs_dirty_inode(struct inode *ip, int flags) SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + tmp_ts = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_ts, atime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); mode = ip->i_mode; zp->z_mode = mode; @@ -3714,7 +3940,7 @@ zfs_dirty_inode(struct inode *ip, int flags) dmu_tx_commit(tx); out: - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3747,7 +3973,9 @@ zfs_inactive(struct inode *ip) if (error) { dmu_tx_abort(tx); } else { - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime; + tmp_atime = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); @@ -3766,55 +3994,45 @@ zfs_inactive(struct inode *ip) * Fill pages with data from the disk. */ static int -zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_fillpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - objset_t *os; - struct page *cur_pp; - u_offset_t io_off, total; - size_t io_len; - loff_t i_size; - unsigned page_idx; - int err; + loff_t i_size = i_size_read(ip); + u_offset_t io_off = page_offset(pp); + size_t io_len = PAGE_SIZE; - os = zfsvfs->z_os; - io_len = nr_pages << PAGE_SHIFT; - i_size = i_size_read(ip); - io_off = page_offset(pl[0]); + ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; - /* - * Iterate over list of pages and read each page individually. - */ - page_idx = 0; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; + void *va = kmap(pp); + int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + io_len, va, DMU_READ_PREFETCH); + if (io_len != PAGE_SIZE) + memset((char *)va + io_len, 0, PAGE_SIZE - io_len); + kunmap(pp); - cur_pp = pl[page_idx++]; - va = kmap(cur_pp); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - kunmap(cur_pp); - if (err) { - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = SET_ERROR(EIO); - return (err); - } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); } - return (0); + return (error); } /* - * Uses zfs_fillpage to read data from the file and fill the pages. + * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. - * pl - list of pages to read - * nr_pages - number of pages to read + * pp - page to read * * RETURN: 0 on success, error code on failure. * @@ -3822,24 +4040,22 @@ zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) * vp - atime updated */ int -zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) +zfs_getpage(struct inode *ip, struct page *pp) { - znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); - int err; - - if (pl == NULL) - return (0); + znode_t *zp = ITOZ(ip); + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); - err = zfs_fillpage(ip, pl, nr_pages); + error = zfs_fillpage(ip, pp); + if (error == 0) + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE); + zfs_exit(zfsvfs, FTAG); - ZFS_EXIT(zfsvfs); - return (err); + return (error); } /* @@ -3861,28 +4077,29 @@ zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, (void) addrp; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); - if ((vm_flags & VM_WRITE) && (zp->z_pflags & - (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); + if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && + (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENXIO)); } - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -3913,11 +4130,11 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, uint64_t off, len; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -3926,12 +4143,12 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } @@ -3941,8 +4158,9 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ - if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, + zfs_init_idmap))) { + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3951,7 +4169,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, error = zfs_freesp(zp, off, len, flag, TRUE); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3966,19 +4184,23 @@ zfs_fid(struct inode *ip, fid_t *fidp) zfid_short_t *zfid; int size, i, error; - ZFS_ENTER(zfsvfs); + if ((error = zfs_enter(zfsvfs, FTAG)) != 0) + return (error); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } - ZFS_VERIFY_ZP(zp); + if ((error = zfs_verify_zp(zp)) != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (error); } @@ -3999,7 +4221,7 @@ zfs_fid(struct inode *ip, fid_t *fidp) for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); - ZFS_EXIT(zfsvfs); + zfs_exit(zfsvfs, FTAG); return (0); } @@ -4030,5 +4252,4 @@ EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); - #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index dc504b1a120b..b99df188c64b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -169,8 +169,7 @@ zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) znode_hold_t *zh = buf; mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); - zfs_refcount_create(&zh->zh_refcount); - zh->zh_obj = ZFS_NO_OBJECT; + zh->zh_refcount = 0; return (0); } @@ -182,7 +181,6 @@ zfs_znode_hold_cache_destructor(void *buf, void *arg) znode_hold_t *zh = buf; mutex_destroy(&zh->zh_lock); - zfs_refcount_destroy(&zh->zh_refcount); } void @@ -273,7 +271,7 @@ zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) return (held); } -static znode_hold_t * +znode_hold_t * zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, *zh_new, search; @@ -281,43 +279,43 @@ zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) boolean_t found = B_FALSE; zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); - zh_new->zh_obj = obj; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); if (likely(zh == NULL)) { zh = zh_new; + zh->zh_obj = obj; avl_add(&zfsvfs->z_hold_trees[i], zh); } else { ASSERT3U(zh->zh_obj, ==, obj); found = B_TRUE; } - zfs_refcount_add(&zh->zh_refcount, NULL); + zh->zh_refcount++; + ASSERT3S(zh->zh_refcount, >, 0); mutex_exit(&zfsvfs->z_hold_locks[i]); if (found == B_TRUE) kmem_cache_free(znode_hold_cache, zh_new); ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); - ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); mutex_enter(&zh->zh_lock); return (zh); } -static void +void zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) { int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); boolean_t remove = B_FALSE; ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); - ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); mutex_exit(&zh->zh_lock); mutex_enter(&zfsvfs->z_hold_locks[i]); - if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) { + ASSERT3S(zh->zh_refcount, >, 0); + if (--zh->zh_refcount == 0) { avl_remove(&zfsvfs->z_hold_trees[i], zh); remove = B_TRUE; } @@ -359,7 +357,7 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, void zfs_znode_dmu_fini(znode_t *zp) { - ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || + ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); @@ -392,7 +390,6 @@ zfs_inode_destroy(struct inode *ip) mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; } mutex_exit(&zfsvfs->z_znodes_lock); @@ -417,12 +414,21 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + ip->i_fop = &zpl_file_operations.kabi_fops; +#else ip->i_fop = &zpl_file_operations; +#endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; case S_IFDIR: +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER + ip->i_flags |= S_IOPS_WRAPPER; + ip->i_op = &zpl_dir_inode_operations.ops; +#else ip->i_op = &zpl_dir_inode_operations; +#endif ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; @@ -452,7 +458,11 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + ip->i_fop = &zpl_file_operations.kabi_fops; +#else ip->i_fop = &zpl_file_operations; +#endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; } @@ -492,13 +502,11 @@ zfs_set_inode_flags(znode_t *zp, struct inode *ip) void zfs_znode_update_vfs(znode_t *zp) { - zfsvfs_t *zfsvfs; struct inode *ip; uint32_t blksize; u_longlong_t i_blocks; ASSERT(zp != NULL); - zfsvfs = ZTOZSB(zp); ip = ZTOI(zp); /* Skip .zfs control nodes which do not exist on disk. */ @@ -534,6 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; @@ -550,9 +559,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; +#endif zp->z_is_ctldir = B_FALSE; - zp->z_is_stale = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; @@ -604,9 +614,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; - ZFS_TIME_DECODE(&ip->i_atime, atime); - ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&ip->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + zpl_inode_set_atime_to_ts(ip, tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ip, tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; @@ -631,7 +644,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) @@ -1187,6 +1199,7 @@ zfs_rezget(znode_t *zp) uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; @@ -1279,9 +1292,12 @@ zfs_rezget(znode_t *zp) zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); - ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { @@ -1389,21 +1405,24 @@ zfs_zinactive(znode_t *zp) boolean_t zfs_relatime_need_update(const struct inode *ip) { - inode_timespec_t now; + inode_timespec_t now, tmp_atime, tmp_ts; gethrestime(&now); + tmp_atime = zpl_inode_get_atime(ip); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ - if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_mtime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_ctime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) + if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); @@ -1426,7 +1445,7 @@ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { - inode_timespec_t now; + inode_timespec_t now, tmp_ts; gethrestime(&now); @@ -1434,7 +1453,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); @@ -1443,7 +1463,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } @@ -1641,7 +1662,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. */ - if (zp->z_is_mapped) { + if (zn_has_cached_data(zp, off, off + len - 1)) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; @@ -1864,7 +1885,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; - char *name; + const char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); @@ -1883,6 +1904,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + ASSERT(error == 0); /* * Create zap object used for SA attribute registration @@ -1960,7 +1982,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, - cr, NULL, &acl_ids)); + cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); @@ -2136,7 +2158,6 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, } else if (error != ENOENT) { return (error); } - error = 0; for (;;) { uint64_t pobj = 0; @@ -2252,6 +2273,91 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index dcab02b07894..21f3740f6fe6 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -223,14 +223,32 @@ int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; - crypto_mechanism_t mech; + crypto_mechanism_t mech = {0}; uint_t keydata_len; ASSERT(key != NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); +/* + * Workaround for GCC 12+ with UBSan enabled deficencies. + * + * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code + * below as violating -Warray-bounds + */ +#if defined(__GNUC__) && !defined(__clang__) && \ + ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ + defined(CONFIG_UBSAN)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif keydata_len = zio_crypt_table[crypt].ci_keylen; +#if defined(__GNUC__) && !defined(__clang__) && \ + ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ + defined(CONFIG_UBSAN)) +#pragma GCC diagnostic pop +#endif memset(key, 0, sizeof (zio_crypt_key_t)); + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); @@ -282,7 +300,6 @@ zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; - rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); return (0); @@ -1388,7 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, boolean_t *no_crypt) { int ret; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; @@ -1415,7 +1432,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { @@ -1428,6 +1448,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) @@ -1496,20 +1518,16 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, * authenticate it. */ if (txtype == TX_WRITE) { - crypt_len = sizeof (lr_write_t) - - sizeof (lr_t) - sizeof (blkptr_t); + const size_t o = offsetof(lr_write_t, lr_blkptr); + crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ - memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), - slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - sizeof (blkptr_t)); - memcpy(aadp, - slrp + sizeof (lr_write_t) - sizeof (blkptr_t), - sizeof (blkptr_t)); + memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); + memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); nr_iovecs++; @@ -1526,6 +1544,21 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, nr_iovecs++; total_len += crypt_len; } + } else if (txtype == TX_CLONE_RANGE) { + const size_t o = offsetof(lr_clone_range_t, lr_nbps); + crypt_len = o - sizeof (lr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bps now since they will not be encrypted */ + memcpy(dlrp + o, slrp + o, lr_len - o); + memcpy(aadp, slrp + o, lr_len - o); + aadp += lr_len - o; + aad_len += lr_len - o; + nr_iovecs++; + total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); @@ -1891,6 +1924,9 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; + memset(&puio, 0, sizeof (puio)); + memset(&cuio, 0, sizeof (cuio)); + /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. @@ -1950,9 +1986,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, /* If the hardware implementation fails fall back to software */ } - memset(&puio, 0, sizeof (puio)); - memset(&cuio, 0, sizeof (cuio)); - /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, @@ -1968,7 +2001,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, if (locked) { rw_exit(&key->zk_salt_lock); - locked = B_FALSE; } if (authbuf != NULL) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index d5c222120a9d..8ee7fcecc7b7 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -42,7 +42,7 @@ static int zpl_common_open(struct inode *ip, struct file *filp) { - if (filp->f_mode & FMODE_WRITE) + if (blk_mode_is_open_write(filp->f_mode)) return (-EACCES); return (generic_file_open(ip, filp)); @@ -57,7 +57,8 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); if (!zpl_dir_emit_dots(filp, ctx)) goto out; @@ -78,7 +79,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) ctx->pos++; } out: - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (error); } @@ -102,7 +103,11 @@ zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) * Get root directory attributes. */ static int -#ifdef HAVE_USERNS_IOPS_GETATTR +#ifdef HAVE_IDMAP_IOPS_GETATTR +zpl_root_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_root_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -114,9 +119,13 @@ zpl_root_getattr_impl(const struct path *path, struct kstat *stat, (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; -#ifdef HAVE_USERNS_IOPS_GETATTR +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP) + generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -207,7 +216,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) return (!!dentry->d_inode); } -static const dentry_operations_t zpl_dops_snapdirs = { +static dentry_operations_t zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() @@ -258,7 +267,8 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) uint64_t id, pos; int error = 0; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); cookie = spl_fstrans_mark(); if (!zpl_dir_emit_dots(filp, ctx)) @@ -282,7 +292,7 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) } out: spl_fstrans_unmark(cookie); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); if (error == -ENOENT) return (0); @@ -310,6 +320,10 @@ static int zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#elif defined(HAVE_IOPS_RENAME_IDMAP) +zpl_snapdir_rename2(struct mnt_idmap *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) #else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) @@ -331,7 +345,9 @@ zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, return (error); } -#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) +#if (!defined(HAVE_RENAME_WANTS_FLAGS) && \ + !defined(HAVE_IOPS_RENAME_USERNS) && \ + !defined(HAVE_IOPS_RENAME_IDMAP)) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -358,6 +374,9 @@ static int #ifdef HAVE_IOPS_MKDIR_USERNS zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) +#elif defined(HAVE_IOPS_MKDIR_IDMAP) +zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, + struct dentry *dentry, umode_t mode) #else zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) #endif @@ -369,7 +388,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dip, mode | S_IFDIR, cr); +#if (defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) + zpl_vap_init(vap, dip, mode | S_IFDIR, cr, user_ns); +#else + zpl_vap_init(vap, dip, mode | S_IFDIR, cr, zfs_init_idmap); +#endif error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { @@ -389,7 +412,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) * Get snapshot directory attributes. */ static int -#ifdef HAVE_USERNS_IOPS_GETATTR +#ifdef HAVE_IDMAP_IOPS_GETATTR +zpl_snapdir_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_snapdir_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -401,11 +428,17 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; - ZPL_ENTER(zfsvfs); -#ifdef HAVE_USERNS_IOPS_GETATTR + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP) + generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -422,7 +455,7 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, dmu_objset_pool(ds->ds_objset)->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (err != 0) { - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (-err); } stat->nlink += snap_count; @@ -430,7 +463,7 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (0); } @@ -463,7 +496,9 @@ const struct file_operations zpl_fops_snapdir = { const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, -#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) +#if (defined(HAVE_RENAME_WANTS_FLAGS) || \ + defined(HAVE_IOPS_RENAME_USERNS) || \ + defined(HAVE_IOPS_RENAME_IDMAP)) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, @@ -508,7 +543,8 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) znode_t *dzp; int error = 0; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { @@ -527,7 +563,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); @@ -553,6 +589,10 @@ static int zpl_shares_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#elif defined(HAVE_IDMAP_IOPS_GETATTR) +zpl_shares_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) #else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -564,12 +604,17 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, znode_t *dzp; int error; - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); if (zfsvfs->z_shares_dir == 0) { -#ifdef HAVE_USERNS_IOPS_GETATTR +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, path->dentry->d_inode, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP) + generic_fillattr(user_ns, path->dentry->d_inode, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) + generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif @@ -578,25 +623,24 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, #endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (0); } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { -#ifdef HAVE_USERNS_IOPS_GETATTR -#ifdef HAVE_GENERIC_FILLATTR_USERNS +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ZTOI(dzp), + stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); #else - (void) user_ns; -#endif -#else error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); #endif iput(ZTOI(dzp)); } - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c index 5be63532d329..aa80b72e2d7a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 9a640fb40b67..9dec52215c7c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -27,6 +27,7 @@ #ifdef CONFIG_COMPAT #include <linux/compat.h> #endif +#include <linux/fs.h> #include <sys/file.h> #include <sys/dmu_objset.h> #include <sys/zfs_znode.h> @@ -37,6 +38,9 @@ defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) #include <linux/pagemap.h> #endif +#ifdef HAVE_FILE_FADVISE +#include <linux/fadvise.h> +#endif #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include <linux/writeback.h> #endif @@ -191,9 +195,12 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * zfs_putpage() respectively. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - ZPL_ENTER(zfsvfs); + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { + atomic_dec_32(&zp->z_sync_writes_cnt); + return (error); + } zil_commit(zfsvfs->z_log, zp->z_id); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); } error = filemap_write_and_wait_range(inode->i_mapping, start, end); @@ -294,16 +301,11 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); #else -#ifdef HAVE_IOV_ITER_TYPE - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, - iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, - count, skip); -#else - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, - to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, + zfs_uio_iov_iter_type(to) & ITER_KVEC ? + UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif -#endif } static ssize_t @@ -618,7 +620,6 @@ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; - znode_t *zp = ITOZ(ip); int error; fstrans_cookie_t cookie; @@ -633,9 +634,12 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) if (error) return (error); +#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) + znode_t *zp = ITOZ(ip); mutex_enter(&zp->z_lock); zp->z_is_mapped = B_TRUE; mutex_exit(&zp->z_lock); +#endif return (error); } @@ -648,29 +652,16 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) static inline int zpl_readpage_common(struct page *pp) { - struct inode *ip; - struct page *pl[1]; - int error = 0; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); - ip = pp->mapping->host; - pl[0] = pp; cookie = spl_fstrans_mark(); - error = -zfs_getpage(ip, pl, 1); + int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); - if (error) { - SetPageError(pp); - ClearPageUptodate(pp); - } else { - ClearPageError(pp); - SetPageUptodate(pp); - flush_dcache_page(pp); - } - unlock_page(pp); + return (error); } @@ -729,15 +720,38 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { boolean_t *for_sync = data; fstrans_cookie_t cookie; + int ret; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); - (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); + ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); - return (0); + return (ret); +} + +#ifdef HAVE_WRITEPAGE_T_FOLIO +static int +zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) +{ + return (zpl_putpage(&pp->page, wbc, data)); +} +#endif + +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data) +{ + int result; + +#ifdef HAVE_WRITEPAGE_T_FOLIO + result = write_cache_pages(mapping, wbc, zpl_putfolio, data); +#else + result = write_cache_pages(mapping, wbc, zpl_putpage, data); +#endif + return (result); } static int @@ -748,10 +762,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) enum writeback_sync_modes sync_mode; int result; - ZPL_ENTER(zfsvfs); + if ((result = zpl_enter(zfsvfs, FTAG)) != 0) + return (result); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); sync_mode = wbc->sync_mode; /* @@ -763,13 +778,13 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) */ boolean_t for_sync = (sync_mode == WB_SYNC_ALL); wbc->sync_mode = WB_SYNC_NONE; - result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync); + result = zpl_write_cache_pages(mapping, wbc, &for_sync); if (sync_mode != wbc->sync_mode) { - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (result); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); /* * We need to call write_cache_pages() again (we can't just @@ -779,8 +794,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; - result = write_cache_pages(mapping, wbc, zpl_putpage, - &for_sync); + result = zpl_write_cache_pages(mapping, wbc, &for_sync); } return (result); } @@ -906,6 +920,61 @@ zpl_ioctl_getversion(struct file *filp, void __user *arg) return (copy_to_user(arg, &generation, sizeof (generation))); } +#ifdef HAVE_FILE_FADVISE +static int +zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) +{ + struct inode *ip = file_inode(filp); + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os = zfsvfs->z_os; + int error = 0; + + if (S_ISFIFO(ip->i_mode)) + return (-ESPIPE); + + if (offset < 0 || len < 0) + return (-EINVAL); + + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + + switch (advice) { + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: +#ifdef HAVE_GENERIC_FADVISE + if (zn_has_cached_data(zp, offset, offset + len - 1)) + error = generic_fadvise(filp, offset, len, advice); +#endif + /* + * Pass on the caller's size directly, but note that + * dmu_prefetch_max will effectively cap it. If there + * really is a larger sequential access pattern, perhaps + * dmu_zfetch will detect it. + */ + if (len == 0) + len = i_size_read(ip) - offset; + + dmu_prefetch(os, zp->z_id, 0, offset, len, + ZIO_PRIORITY_ASYNC_READ); + break; + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_DONTNEED: + case POSIX_FADV_NOREUSE: + /* ignored for now */ + break; + default: + error = -EINVAL; + break; + } + + zfs_exit(zfsvfs, FTAG); + + return (error); +} +#endif /* HAVE_FILE_FADVISE */ + #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) @@ -975,7 +1044,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); @@ -1022,7 +1091,7 @@ zpl_ioctl_setflags(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); @@ -1070,7 +1139,7 @@ zpl_ioctl_setxattr(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); @@ -1105,7 +1174,7 @@ __zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva) !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); @@ -1158,7 +1227,7 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); @@ -1183,6 +1252,12 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FICLONE: + return (zpl_ioctl_ficlone(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FICLONERANGE: + return (zpl_ioctl_ficlonerange(filp, (void *)arg)); + case ZFS_IOC_COMPAT_FIDEDUPERANGE: + return (zpl_ioctl_fideduperange(filp, (void *)arg)); default: return (-ENOTTY); } @@ -1209,7 +1284,6 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } #endif /* CONFIG_COMPAT */ - const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_READPAGES .readpages = zpl_readpages, @@ -1232,7 +1306,12 @@ const struct address_space_operations zpl_address_space_operations = { #endif }; +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND +const struct file_operations_extend zpl_file_operations = { + .kabi_fops = { +#else const struct file_operations zpl_file_operations = { +#endif .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, @@ -1244,7 +1323,11 @@ const struct file_operations zpl_file_operations = { .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER +#ifdef HAVE_COPY_SPLICE_READ + .splice_read = copy_splice_read, +#else .splice_read = generic_file_splice_read, +#endif .splice_write = iter_file_splice_write, #endif #else @@ -1259,10 +1342,30 @@ const struct file_operations zpl_file_operations = { .aio_fsync = zpl_aio_fsync, #endif .fallocate = zpl_fallocate, +#ifdef HAVE_VFS_COPY_FILE_RANGE + .copy_file_range = zpl_copy_file_range, +#endif +#ifdef HAVE_VFS_CLONE_FILE_RANGE + .clone_file_range = zpl_clone_file_range, +#endif +#ifdef HAVE_VFS_REMAP_FILE_RANGE + .remap_file_range = zpl_remap_file_range, +#endif +#ifdef HAVE_VFS_DEDUPE_FILE_RANGE + .dedupe_file_range = zpl_dedupe_file_range, +#endif +#ifdef HAVE_FILE_FADVISE + .fadvise = zpl_fadvise, +#endif .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif +#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND + }, /* kabi_fops */ + .copy_file_range = zpl_copy_file_range, + .clone_file_range = zpl_clone_file_range, +#endif }; const struct file_operations zpl_dir_file_operations = { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c new file mode 100644 index 000000000000..64728fdb1187 --- /dev/null +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file_range.c @@ -0,0 +1,299 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2023, Klara Inc. + */ + +#ifdef CONFIG_COMPAT +#include <linux/compat.h> +#endif +#include <linux/fs.h> +#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE +#include <linux/splice.h> +#endif +#include <sys/file.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_vnops.h> +#include <sys/zfeature.h> + +/* + * Clone part of a file via block cloning. + * + * Note that we are not required to update file offsets; the kernel will take + * care of that depending on how it was called. + */ +static ssize_t +zpl_clone_file_range_impl(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len) +{ + struct inode *src_i = file_inode(src_file); + struct inode *dst_i = file_inode(dst_file); + uint64_t src_off_o = (uint64_t)src_off; + uint64_t dst_off_o = (uint64_t)dst_off; + uint64_t len_o = (uint64_t)len; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int err; + + if (!zfs_bclone_enabled) + return (-EOPNOTSUPP); + + if (!spa_feature_is_enabled( + dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING)) + return (-EOPNOTSUPP); + + if (src_i != dst_i) + spl_inode_lock_shared(src_i); + spl_inode_lock(dst_i); + + crhold(cr); + cookie = spl_fstrans_mark(); + + err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i), + &dst_off_o, &len_o, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + spl_inode_unlock(dst_i); + if (src_i != dst_i) + spl_inode_unlock_shared(src_i); + + if (err < 0) + return (err); + + return ((ssize_t)len_o); +} + +#if defined(HAVE_VFS_COPY_FILE_RANGE) || \ + defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) +/* + * Entry point for copy_file_range(). Copy len bytes from src_off in src_file + * to dst_off in dst_file. We are permitted to do this however we like, so we + * try to just clone the blocks, and if we can't support it, fall back to the + * kernel's generic byte copy function. + */ +ssize_t +zpl_copy_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags) +{ + ssize_t ret; + + /* Flags is reserved for future extensions and must be zero. */ + if (flags != 0) + return (-EINVAL); + + /* Try to do it via zfs_clone_range() and allow shortening. */ + ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + +#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE) + /* + * Since Linux 5.3 the filesystem driver is responsible for executing + * an appropriate fallback, and a generic fallback function is provided. + */ + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) + ret = generic_copy_file_range(src_file, src_off, dst_file, + dst_off, len, flags); +#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE) + /* + * Since 6.8 the fallback function is called splice_copy_file_range + * and has a slightly different signature. + */ + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); +#else + /* + * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal + * to the kernel that it should fallback to a content copy. + */ + if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) + ret = -EOPNOTSUPP; +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */ + + return (ret); +} +#endif /* HAVE_VFS_COPY_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ + +#ifdef HAVE_VFS_REMAP_FILE_RANGE +/* + * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE. + * + * FICLONE and FICLONERANGE are basically the same as copy_file_range(), except + * that they must clone - they cannot fall back to copying. FICLONE is exactly + * FICLONERANGE, for the entire file. We don't need to try to tell them apart; + * the kernel will sort that out for us. + * + * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the + * range in both files and if they're the same, arrange for them to be backed + * by the same storage. + * + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range + * if we want. It's designed for filesystems that may need to shorten the + * length for alignment, EOF, or any other requirement. ZFS may shorten the + * request when there is outstanding dirty data which hasn't been written. + */ +loff_t +zpl_remap_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags) +{ + if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) + return (-EINVAL); + + /* No support for dedup yet */ + if (flags & REMAP_FILE_DEDUP) + return (-EOPNOTSUPP); + + /* Zero length means to clone everything to the end of the file */ + if (len == 0) + len = i_size_read(file_inode(src_file)) - src_off; + + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); +} +#endif /* HAVE_VFS_REMAP_FILE_RANGE */ + +#if defined(HAVE_VFS_CLONE_FILE_RANGE) || \ + defined(HAVE_VFS_FILE_OPERATIONS_EXTEND) +/* + * Entry point for FICLONE and FICLONERANGE, before Linux 4.20. + */ +int +zpl_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len) +{ + /* Zero length means to clone everything to the end of the file */ + if (len == 0) + len = i_size_read(file_inode(src_file)) - src_off; + + /* The entire length must be cloned or this is an error. */ + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); +} +#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ + +#ifdef HAVE_VFS_DEDUPE_FILE_RANGE +/* + * Entry point for FIDEDUPERANGE, before Linux 4.20. + */ +int +zpl_dedupe_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, uint64_t len) +{ + /* No support for dedup yet */ + return (-EOPNOTSUPP); +} +#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */ + +/* Entry point for FICLONE, before Linux 4.5. */ +long +zpl_ioctl_ficlone(struct file *dst_file, void *arg) +{ + unsigned long sfd = (unsigned long)arg; + + struct file *src_file = fget(sfd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) { + fput(src_file); + return (-EXDEV); + } + + size_t len = i_size_read(file_inode(src_file)); + + ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FICLONERANGE, before Linux 4.5. */ +long +zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) +{ + zfs_ioc_compat_file_clone_range_t fcr; + + if (copy_from_user(&fcr, arg, sizeof (fcr))) + return (-EFAULT); + + struct file *src_file = fget(fcr.fcr_src_fd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) { + fput(src_file); + return (-EXDEV); + } + + size_t len = fcr.fcr_src_length; + if (len == 0) + len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; + + ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, + dst_file, fcr.fcr_dest_offset, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FIDEDUPERANGE, before Linux 4.5. */ +long +zpl_ioctl_fideduperange(struct file *filp, void *arg) +{ + (void) arg; + + /* No support for dedup yet */ + return (-ENOTTY); +} diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index 4f79265a0856..ad1753f7a071 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -24,6 +24,7 @@ */ +#include <sys/sysmacros.h> #include <sys/zfs_ctldir.h> #include <sys/zfs_vfsops.h> #include <sys/zfs_vnops.h> @@ -33,7 +34,6 @@ #include <sys/zpl.h> #include <sys/file.h> - static struct dentry * zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -112,18 +112,22 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } void -zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr) +zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, + zidmap_t *mnt_ns) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; - vap->va_uid = crgetuid(cr); - if (dir && dir->i_mode & S_ISGID) { + vap->va_uid = zfs_vfsuid_to_uid(mnt_ns, + zfs_i_user_ns(dir), crgetuid(cr)); + + if (dir->i_mode & S_ISGID) { vap->va_gid = KGID_TO_SGID(dir->i_gid); if (S_ISDIR(mode)) vap->va_mode |= S_ISGID; } else { - vap->va_gid = crgetgid(cr); + vap->va_gid = zfs_vfsgid_to_gid(mnt_ns, + zfs_i_user_ns(dir), crgetgid(cr)); } } @@ -131,6 +135,9 @@ static int #ifdef HAVE_IOPS_CREATE_USERNS zpl_create(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) +#elif defined(HAVE_IOPS_CREATE_IDMAP) +zpl_create(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool flag) #else zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) #endif @@ -140,14 +147,17 @@ zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) vattr_t *vap; int error; fstrans_cookie_t cookie; +#if !(defined(HAVE_IOPS_CREATE_USERNS) || defined(HAVE_IOPS_CREATE_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); + zpl_vap_init(vap, dir, mode, cr, user_ns); cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, - mode, &zp, cr, 0, NULL); + mode, &zp, cr, 0, NULL, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) @@ -174,6 +184,9 @@ static int #ifdef HAVE_IOPS_MKNOD_USERNS zpl_mknod(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, +#elif defined(HAVE_IOPS_MKNOD_IDMAP) +zpl_mknod(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, #else zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, #endif @@ -184,6 +197,9 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, vattr_t *vap; int error; fstrans_cookie_t cookie; +#if !(defined(HAVE_IOPS_MKNOD_USERNS) || defined(HAVE_IOPS_MKNOD_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; +#endif /* * We currently expect Linux to supply rdev=0 for all sockets @@ -194,12 +210,12 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); + zpl_vap_init(vap, dir, mode, cr, user_ns); vap->va_rdev = rdev; cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, - mode, &zp, cr, 0, NULL); + mode, &zp, cr, 0, NULL, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) @@ -224,18 +240,29 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, #ifdef HAVE_TMPFILE static int +#ifdef HAVE_TMPFILE_IDMAP +zpl_tmpfile(struct mnt_idmap *userns, struct inode *dir, + struct file *file, umode_t mode) +#elif !defined(HAVE_TMPFILE_DENTRY) +zpl_tmpfile(struct user_namespace *userns, struct inode *dir, + struct file *file, umode_t mode) +#else #ifdef HAVE_TMPFILE_USERNS zpl_tmpfile(struct user_namespace *userns, struct inode *dir, struct dentry *dentry, umode_t mode) #else zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) #endif +#endif { cred_t *cr = CRED(); struct inode *ip; vattr_t *vap; int error; fstrans_cookie_t cookie; +#if !(defined(HAVE_TMPFILE_USERNS) || defined(HAVE_TMPFILE_IDMAP)) + zidmap_t *userns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); @@ -245,18 +272,28 @@ zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) */ if (!IS_POSIXACL(dir)) mode &= ~current_umask(); - zpl_vap_init(vap, dir, mode, cr); + zpl_vap_init(vap, dir, mode, cr, userns); cookie = spl_fstrans_mark(); - error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); + error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL, userns); if (error == 0) { /* d_tmpfile will do drop_nlink, so we should set it first */ set_nlink(ip, 1); +#ifndef HAVE_TMPFILE_DENTRY + d_tmpfile(file, ip); + + error = zpl_xattr_security_init(ip, dir, + &file->f_path.dentry->d_name); +#else d_tmpfile(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); +#endif if (error == 0) error = zpl_init_acl(ip, dir); +#ifndef HAVE_TMPFILE_DENTRY + error = finish_open_simple(file, error); +#endif /* * don't need to handle error here, file is already in * unlinked set. @@ -302,6 +339,9 @@ static int #ifdef HAVE_IOPS_MKDIR_USERNS zpl_mkdir(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode) +#elif defined(HAVE_IOPS_MKDIR_IDMAP) +zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode) #else zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) #endif @@ -311,13 +351,17 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) znode_t *zp; int error; fstrans_cookie_t cookie; +#if !(defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode | S_IFDIR, cr); + zpl_vap_init(vap, dir, mode | S_IFDIR, cr, user_ns); cookie = spl_fstrans_mark(); - error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL); + error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL, + user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) @@ -371,6 +415,10 @@ static int zpl_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#elif defined(HAVE_IDMAP_IOPS_GETATTR) +zpl_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) #else zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -387,7 +435,9 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, * XXX query_flags currently ignored. */ -#ifdef HAVE_USERNS_IOPS_GETATTR +#ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK + error = -zfs_getattr_fast(user_ns, request_mask, ip, stat); +#elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ip, stat); #else error = -zfs_getattr_fast(kcred->user_ns, ip, stat); @@ -426,9 +476,12 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, ZPL_GETATTR_WRAPPER(zpl_getattr); static int -#ifdef HAVE_SETATTR_PREPARE_USERNS +#ifdef HAVE_USERNS_IOPS_SETATTR zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry, struct iattr *ia) +#elif defined(HAVE_IDMAP_IOPS_SETATTR) +zpl_setattr(struct mnt_idmap *user_ns, struct dentry *dentry, + struct iattr *ia) #else zpl_setattr(struct dentry *dentry, struct iattr *ia) #endif @@ -439,7 +492,13 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) int error; fstrans_cookie_t cookie; - error = zpl_setattr_prepare(kcred->user_ns, dentry, ia); +#ifdef HAVE_SETATTR_PREPARE_USERNS + error = zpl_setattr_prepare(user_ns, dentry, ia); +#elif defined(HAVE_SETATTR_PREPARE_IDMAP) + error = zpl_setattr_prepare(user_ns, dentry, ia); +#else + error = zpl_setattr_prepare(zfs_init_idmap, dentry, ia); +#endif if (error) return (error); @@ -447,18 +506,37 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; vap->va_mode = ia->ia_mode; - vap->va_uid = KUID_TO_SUID(ia->ia_uid); - vap->va_gid = KGID_TO_SGID(ia->ia_gid); + if (ia->ia_valid & ATTR_UID) +#ifdef HAVE_IATTR_VFSID + vap->va_uid = zfs_vfsuid_to_uid(user_ns, zfs_i_user_ns(ip), + __vfsuid_val(ia->ia_vfsuid)); +#else + vap->va_uid = KUID_TO_SUID(ia->ia_uid); +#endif + if (ia->ia_valid & ATTR_GID) +#ifdef HAVE_IATTR_VFSID + vap->va_gid = zfs_vfsgid_to_gid(user_ns, zfs_i_user_ns(ip), + __vfsgid_val(ia->ia_vfsgid)); +#else + vap->va_gid = KGID_TO_SGID(ia->ia_gid); +#endif vap->va_size = ia->ia_size; vap->va_atime = ia->ia_atime; vap->va_mtime = ia->ia_mtime; vap->va_ctime = ia->ia_ctime; if (vap->va_mask & ATTR_ATIME) - ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); + zpl_inode_set_atime_to_ts(ip, + zpl_inode_timestamp_truncate(ia->ia_atime, ip)); cookie = spl_fstrans_mark(); - error = -zfs_setattr(ITOZ(ip), vap, 0, cr); +#ifdef HAVE_USERNS_IOPS_SETATTR + error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns); +#elif defined(HAVE_IDMAP_IOPS_SETATTR) + error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns); +#else + error = -zfs_setattr(ITOZ(ip), vap, 0, cr, zfs_init_idmap); +#endif if (!error && (ia->ia_valid & ATTR_MODE)) error = zpl_chmod_acl(ip); @@ -474,32 +552,47 @@ static int #ifdef HAVE_IOPS_RENAME_USERNS zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, - unsigned int flags) + unsigned int rflags) +#elif defined(HAVE_IOPS_RENAME_IDMAP) +zpl_rename2(struct mnt_idmap *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int rflags) #else zpl_rename2(struct inode *sdip, struct dentry *sdentry, - struct inode *tdip, struct dentry *tdentry, unsigned int flags) + struct inode *tdip, struct dentry *tdentry, unsigned int rflags) #endif { cred_t *cr = CRED(); + vattr_t *wo_vap = NULL; int error; fstrans_cookie_t cookie; - - /* We don't have renameat2(2) support */ - if (flags) - return (-EINVAL); +#if !(defined(HAVE_IOPS_RENAME_USERNS) || defined(HAVE_IOPS_RENAME_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; +#endif crhold(cr); + if (rflags & RENAME_WHITEOUT) { + wo_vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(wo_vap, sdip, S_IFCHR, cr, user_ns); + wo_vap->va_rdev = makedevice(0, 0); + } + cookie = spl_fstrans_mark(); error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), - dname(tdentry), cr, 0); + dname(tdentry), cr, 0, rflags, wo_vap, user_ns); spl_fstrans_unmark(cookie); + if (wo_vap) + kmem_free(wo_vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } -#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) +#if !defined(HAVE_IOPS_RENAME_USERNS) && \ + !defined(HAVE_RENAME_WANTS_FLAGS) && \ + !defined(HAVE_RENAME2) && \ + !defined(HAVE_IOPS_RENAME_IDMAP) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -512,6 +605,9 @@ static int #ifdef HAVE_IOPS_SYMLINK_USERNS zpl_symlink(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, const char *name) +#elif defined(HAVE_IOPS_SYMLINK_IDMAP) +zpl_symlink(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, const char *name) #else zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) #endif @@ -521,14 +617,17 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) znode_t *zp; int error; fstrans_cookie_t cookie; +#if !(defined(HAVE_IOPS_SYMLINK_USERNS) || defined(HAVE_IOPS_SYMLINK_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); + zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr, user_ns); cookie = spl_fstrans_mark(); error = -zfs_symlink(ITOZ(dir), dname(dentry), vap, - (char *)name, &zp, cr, 0); + (char *)name, &zp, cr, 0, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error) { @@ -678,7 +777,7 @@ zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) return (-EMLINK); crhold(cr); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); /* Must have an existing ref, so igrab() cannot return NULL */ VERIFY3P(igrab(ip), !=, NULL); @@ -698,46 +797,6 @@ out: return (error); } -static int -#ifdef HAVE_D_REVALIDATE_NAMEIDATA -zpl_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - unsigned int flags = (nd ? nd->flags : 0); -#else -zpl_revalidate(struct dentry *dentry, unsigned int flags) -{ -#endif /* HAVE_D_REVALIDATE_NAMEIDATA */ - /* CSTYLED */ - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; - int error; - - if (flags & LOOKUP_RCU) - return (-ECHILD); - - /* - * After a rollback negative dentries created before the rollback - * time must be invalidated. Otherwise they can obscure files which - * are only present in the rolled back dataset. - */ - if (dentry->d_inode == NULL) { - spin_lock(&dentry->d_lock); - error = time_before(dentry->d_time, zfsvfs->z_rollback_time); - spin_unlock(&dentry->d_lock); - - if (error) - return (0); - } - - /* - * The dentry may reference a stale inode if a mounted file system - * was rolled back to a point in time where the object didn't exist. - */ - if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) - return (0); - - return (1); -} - const struct inode_operations zpl_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, @@ -751,11 +810,20 @@ const struct inode_operations zpl_inode_operations = { #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ +#if defined(HAVE_GET_INODE_ACL) + .get_inode_acl = zpl_get_acl, +#else .get_acl = zpl_get_acl, +#endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ }; +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER +const struct inode_operations_wrapper zpl_dir_inode_operations = { + .ops = { +#else const struct inode_operations zpl_dir_inode_operations = { +#endif .create = zpl_create, .lookup = zpl_lookup, .link = zpl_link, @@ -764,7 +832,11 @@ const struct inode_operations zpl_dir_inode_operations = { .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, -#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) +#ifdef HAVE_RENAME2 + .rename2 = zpl_rename2, +#elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) + .rename = zpl_rename2, +#elif defined(HAVE_IOPS_RENAME_IDMAP) .rename = zpl_rename2, #else .rename = zpl_rename, @@ -784,8 +856,16 @@ const struct inode_operations zpl_dir_inode_operations = { #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ +#if defined(HAVE_GET_INODE_ACL) + .get_inode_acl = zpl_get_acl, +#else .get_acl = zpl_get_acl, +#endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER + }, + .rename2 = zpl_rename2, +#endif }; const struct inode_operations zpl_symlink_inode_operations = { @@ -823,10 +903,10 @@ const struct inode_operations zpl_special_inode_operations = { #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ +#if defined(HAVE_GET_INODE_ACL) + .get_inode_acl = zpl_get_acl, +#else .get_acl = zpl_get_acl, +#endif /* HAVE_GET_INODE_ACL */ #endif /* CONFIG_FS_POSIX_ACL */ }; - -dentry_operations_t zpl_dentry_operations = { - .d_revalidate = zpl_revalidate, -}; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index b18efde9b18a..d98d32c1f9fb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + * Copyright (c) 2023, Datto Inc. All rights reserved. */ @@ -185,7 +186,9 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data) static int __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { - ZPL_ENTER(zfsvfs); + int error; + if ((error = zpl_enter(zfsvfs, FTAG)) != 0) + return (error); char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dmu_objset_name(zfsvfs->z_os, fsname); @@ -205,7 +208,7 @@ __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); return (0); } @@ -233,6 +236,18 @@ __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) } #endif /* CONFIG_FS_POSIX_ACL */ + switch (zfsvfs->z_case) { + case ZFS_CASE_SENSITIVE: + seq_puts(seq, ",casesensitive"); + break; + case ZFS_CASE_INSENSITIVE: + seq_puts(seq, ",caseinsensitive"); + break; + default: + seq_puts(seq, ",casemixed"); + break; + } + return (0); } @@ -262,11 +277,14 @@ zpl_test_super(struct super_block *s, void *data) { zfsvfs_t *zfsvfs = s->s_fs_info; objset_t *os = data; - - if (zfsvfs == NULL) - return (0); - - return (os == zfsvfs->z_os); + /* + * If the os doesn't match the z_os in the super_block, assume it is + * not a match. Matching would imply a multimount of a dataset. It is + * possible that during a multimount, there is a simultaneous operation + * that changes the z_os, e.g., rollback, where the match will be + * missed, but in that case the user will get an EBUSY. + */ + return (zfsvfs != NULL && os == zfsvfs->z_os); } static struct super_block * @@ -292,12 +310,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); + /* + * Recheck with the lock held to prevent mounting the wrong dataset + * since z_os can be stale when the teardown lock is held. + * + * We can't do this in zpl_test_super in since it's under spinlock and + * also s_umount lock is not held there so it would race with + * zfs_umount and zfsvfs can be freed. + */ + if (!IS_ERR(s) && s->s_fs_info != NULL) { + zfsvfs_t *zfsvfs = s->s_fs_info; + if (zpl_enter(zfsvfs, FTAG) == 0) { + if (os != zfsvfs->z_os) + err = -SET_ERROR(EBUSY); + zpl_exit(zfsvfs, FTAG); + } else { + err = -SET_ERROR(EBUSY); + } + } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) return (ERR_CAST(s)); + if (err) { + deactivate_locked_super(s); + return (ERR_PTR(err)); + } + if (s->s_root == NULL) { err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0); if (err) { @@ -334,7 +375,7 @@ zpl_kill_sb(struct super_block *sb) } void -zpl_prune_sb(int64_t nr_to_scan, void *arg) +zpl_prune_sb(uint64_t nr_to_scan, void *arg) { struct super_block *sb = (struct super_block *)arg; int objects = 0; @@ -360,7 +401,11 @@ const struct super_operations zpl_super_operations = { struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, +#if defined(HAVE_IDMAP_MNT_API) + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, +#else .fs_flags = FS_USERNS_MOUNT, +#endif .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index 98378109cb9a..4e4f5210f85d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -246,8 +246,8 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) crhold(cr); cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + goto out1; rw_enter(&zp->z_xattr_lock, RW_READER); if (zfsvfs->z_use_sa && zp->z_is_sa) { @@ -264,7 +264,8 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) out: rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); +out1: spl_fstrans_unmark(cookie); crfree(cr); @@ -435,12 +436,13 @@ zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) crhold(cr); cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + goto out; rw_enter(&zp->z_xattr_lock, RW_READER); error = __zpl_xattr_get(ip, name, value, size, cr); rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); +out: spl_fstrans_unmark(cookie); crfree(cr); @@ -497,7 +499,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, vap->va_gid = crgetgid(cr); error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, - cr, 0, NULL); + cr, ATTR_NOACLCHECK, NULL, zfs_init_idmap); if (error) goto out; } @@ -511,7 +513,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, error = -zfs_write_simple(xzp, value, size, pos, NULL); out: if (error == 0) { - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); } @@ -604,8 +606,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, crhold(cr); cookie = spl_fstrans_mark(); - ZPL_ENTER(zfsvfs); - ZPL_VERIFY_ZP(zp); + if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + goto out1; rw_enter(&zp->z_xattr_lock, RW_WRITER); /* @@ -658,7 +660,8 @@ zpl_xattr_set(struct inode *ip, const char *name, const void *value, zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); out: rw_exit(&zp->z_xattr_lock); - ZPL_EXIT(zfsvfs); + zpl_exit(zfsvfs, FTAG); +out1: spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); @@ -735,9 +738,11 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); static int -__zpl_xattr_user_set(struct inode *ip, const char *name, +__zpl_xattr_user_set(zidmap_t *user_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { + (void) user_ns; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME @@ -843,9 +848,11 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); static int -__zpl_xattr_trusted_set(struct inode *ip, const char *name, +__zpl_xattr_trusted_set(zidmap_t *user_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { + (void) user_ns; char *xattr_name; int error; @@ -911,9 +918,11 @@ __zpl_xattr_security_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); static int -__zpl_xattr_security_set(struct inode *ip, const char *name, +__zpl_xattr_security_set(zidmap_t *user_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { + (void) user_ns; char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ @@ -937,7 +946,7 @@ zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs, int error = 0; for (xattr = xattrs; xattr->name != NULL; xattr++) { - error = __zpl_xattr_security_set(ip, + error = __zpl_xattr_security_set(NULL, ip, xattr->name, xattr->value, xattr->value_len, 0); if (error < 0) @@ -1002,7 +1011,8 @@ zpl_set_acl_impl(struct inode *ip, struct posix_acl *acl, int type) */ if (ip->i_mode != mode) { ip->i_mode = ITOZ(ip)->z_mode = mode; - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, + current_time(ip)); zfs_mark_inode_dirty(ip); } @@ -1052,11 +1062,23 @@ int #ifdef HAVE_SET_ACL_USERNS zpl_set_acl(struct user_namespace *userns, struct inode *ip, struct posix_acl *acl, int type) +#elif defined(HAVE_SET_ACL_IDMAP_DENTRY) +zpl_set_acl(struct mnt_idmap *userns, struct dentry *dentry, + struct posix_acl *acl, int type) +#elif defined(HAVE_SET_ACL_USERNS_DENTRY_ARG2) +zpl_set_acl(struct user_namespace *userns, struct dentry *dentry, + struct posix_acl *acl, int type) #else zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) #endif /* HAVE_SET_ACL_USERNS */ { +#ifdef HAVE_SET_ACL_USERNS_DENTRY_ARG2 + return (zpl_set_acl_impl(d_inode(dentry), acl, type)); +#elif defined(HAVE_SET_ACL_IDMAP_DENTRY) + return (zpl_set_acl_impl(d_inode(dentry), acl, type)); +#else return (zpl_set_acl_impl(ip, acl, type)); +#endif /* HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ } #endif /* HAVE_SET_ACL */ @@ -1115,7 +1137,7 @@ zpl_get_acl_impl(struct inode *ip, int type) return (acl); } -#if defined(HAVE_GET_ACL_RCU) +#if defined(HAVE_GET_ACL_RCU) || defined(HAVE_GET_INODE_ACL) struct posix_acl * zpl_get_acl(struct inode *ip, int type, bool rcu) { @@ -1149,7 +1171,7 @@ zpl_init_acl(struct inode *ip, struct inode *dir) return (PTR_ERR(acl)); if (!acl) { ITOZ(ip)->z_mode = (ip->i_mode &= ~current_umask()); - ip->i_ctime = current_time(ip); + zpl_inode_set_ctime_to_ts(ip, current_time(ip)); zfs_mark_inode_dirty(ip); return (0); } @@ -1297,7 +1319,8 @@ __zpl_xattr_acl_get_default(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); static int -__zpl_xattr_acl_set_access(struct inode *ip, const char *name, +__zpl_xattr_acl_set_access(zidmap_t *mnt_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; @@ -1311,8 +1334,14 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) +#if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP) + if (!zpl_inode_owner_or_capable(mnt_ns, ip)) return (-EPERM); +#else + (void) mnt_ns; + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) + return (-EPERM); +#endif if (value) { acl = zpl_acl_from_xattr(value, size); @@ -1336,7 +1365,8 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); static int -__zpl_xattr_acl_set_default(struct inode *ip, const char *name, +__zpl_xattr_acl_set_default(zidmap_t *mnt_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; @@ -1350,8 +1380,14 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) +#if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP) + if (!zpl_inode_owner_or_capable(mnt_ns, ip)) return (-EPERM); +#else + (void) mnt_ns; + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) + return (-EPERM); +#endif if (value) { acl = zpl_acl_from_xattr(value, size); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index acbab55d03ef..4b960daf89ee 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -37,6 +37,7 @@ #include <sys/spa_impl.h> #include <sys/zvol.h> #include <sys/zvol_impl.h> +#include <cityhash.h> #include <linux/blkdev_compat.h> #include <linux/task_io_accounting_ops.h> @@ -53,8 +54,14 @@ static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS -static const unsigned int zvol_open_timeout_ms = 1000; +static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; @@ -76,6 +83,8 @@ static boolean_t zvol_use_blk_mq = B_FALSE; static unsigned int zvol_blk_mq_blocks_per_thread = 8; #endif +static unsigned int zvol_num_taskqs = 0; + #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ @@ -114,7 +123,11 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -taskq_t *zvol_taskq; +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; +static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; typedef struct zv_request_stack { @@ -342,8 +355,7 @@ zvol_discard(zv_request_t *zvr) struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time = 0; - - boolean_t acct = blk_queue_io_stat(q); + boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); @@ -388,7 +400,7 @@ zvol_discard(zv_request_t *zvr) if (error != 0) { dmu_tx_abort(tx); } else { - zvol_log_truncate(zv, tx, start, size, B_TRUE); + zvol_log_truncate(zv, tx, start, size); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); @@ -513,7 +525,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); - if (zvol_request_sync) + if (zvol_request_sync || zv->zv_threading == B_FALSE) force_sync = 1; zv_request_t zvr = { @@ -533,6 +545,22 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, } zv_request_task_t *task; + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t blk_mq_hw_queue = 0; + uint_t tq_idx; + uint_t taskq_hash; +#ifdef HAVE_BLK_MQ + if (rq) +#ifdef HAVE_BLK_MQ_RQ_HCTX + blk_mq_hw_queue = rq->mq_hctx->queue_num; +#else + blk_mq_hw_queue = + rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; +#endif +#endif + taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue, 0); + tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { @@ -558,7 +586,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, - zvol_get_data); + zvol_get_data, &zv->zv_kstat.dk_zil_sums); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_create_minor */ VERIFY0((zv->zv_zilog->zl_header->zh_flags & @@ -602,7 +630,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { @@ -610,7 +638,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_write(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } @@ -632,7 +660,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_read(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } @@ -672,7 +700,11 @@ zvol_request(struct request_queue *q, struct bio *bio) } static int +#ifdef HAVE_BLK_MODE_T +zvol_open(struct gendisk *disk, blk_mode_t flag) +#else zvol_open(struct block_device *bdev, fmode_t flag) +#endif { zvol_state_t *zv; int error = 0; @@ -687,10 +719,14 @@ retry: /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting - * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free() + * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ +#ifdef HAVE_BLK_MODE_T + zv = disk->private_data; +#else zv = bdev->bd_disk->private_data; +#endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); @@ -770,14 +806,15 @@ retry: } } - error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); + error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if ((blk_mode_is_open_write(flag)) && + (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); @@ -792,14 +829,25 @@ retry: rw_exit(&zv->zv_suspend_lock); if (error == 0) +#ifdef HAVE_BLK_MODE_T + disk_check_media_change(disk); +#else zfs_check_media_change(bdev); +#endif return (error); } static void -zvol_release(struct gendisk *disk, fmode_t mode) +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG +zvol_release(struct gendisk *disk) +#else +zvol_release(struct gendisk *disk, fmode_t unused) +#endif { +#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) + (void) unused; +#endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; @@ -854,7 +902,13 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKFLSBUF: +#ifdef HAVE_FSYNC_BDEV fsync_bdev(bdev); +#elif defined(HAVE_SYNC_BLOCKDEV) + sync_blockdev(bdev); +#else +#error "Neither fsync_bdev() nor sync_blockdev() found" +#endif invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); @@ -1030,6 +1084,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + return (1); + } + + zso->zvo_disk = disk; + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) @@ -1078,6 +1142,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv) } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + zso->zvo_disk = disk; + zso->zvo_queue = zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { @@ -1174,7 +1249,7 @@ zvol_alloc(dev_t dev, const char *name) zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; - strlcpy(zv->zv_name, name, MAXNAMELEN); + strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); @@ -1231,9 +1306,13 @@ zvol_os_free(zvol_state_t *zv) del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ - defined(HAVE_BLK_ALLOC_DISK) + (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) +#if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else + put_disk(zv->zv_zso->zvo_disk); +#endif +#else blk_cleanup_queue(zv->zv_zso->zvo_queue); put_disk(zv->zv_zso->zvo_disk); #endif @@ -1275,6 +1354,8 @@ zvol_os_create_minor(const char *name) int error = 0; int idx; uint64_t hash = zvol_name_hash(name); + uint64_t volthreading; + bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); @@ -1283,6 +1364,13 @@ zvol_os_create_minor(const char *name) if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; + if (MINOR(minor) != minor) { + /* too many partitions can cause an overflow */ + zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", + name, minor, MINOR(minor)); + ida_simple_remove(&zvol_ida, idx); + return (SET_ERROR(EINVAL)); + } zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { @@ -1320,6 +1408,12 @@ zvol_os_create_minor(const char *name) zv->zv_volsize = volsize; zv->zv_objset = os; + /* Default */ + zv->zv_threading = B_TRUE; + if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) + == 0) + zv->zv_threading = volthreading; + set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, @@ -1408,18 +1502,21 @@ zvol_os_create_minor(const char *name) blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif + ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); + if (error) + goto out_dmu_objset_disown; ASSERT3P(zv->zv_zilog, ==, NULL); - zv->zv_zilog = zil_open(os, zvol_get_data); + zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) - zil_destroy(zv->zv_zilog, B_FALSE); + replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else - zil_replay(os, zv, zvol_replay_vector); + replayed_zil = zil_replay(os, zv, zvol_replay_vector); } - zil_close(zv->zv_zilog); + if (replayed_zil) + zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); - dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); /* * When udev detects the addition of the device it will immediately @@ -1427,7 +1524,7 @@ zvol_os_create_minor(const char *name) * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ - len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); + len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, @@ -1488,6 +1585,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); + + dataset_kstats_rename(&zv->zv_kstat, newname); } void @@ -1528,8 +1627,40 @@ zvol_init(void) zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } + /* + * Use atleast 32 zvol_threads but for many core system, + * prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. + * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + num_online_cpus() / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + uint_t per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1549,11 +1680,22 @@ zvol_init(void) 1024); } #endif - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, - zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (zvol_taskq == NULL) { - unregister_blkdev(zvol_major, ZVOL_DRIVER); - return (-ENOMEM); + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (-ENOMEM); + } } zvol_init_impl(); @@ -1564,9 +1706,22 @@ zvol_init(void) void zvol_fini(void) { + zv_taskq_t *ztqs = &zvol_taskqs; zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - taskq_destroy(zvol_taskq); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } + ida_destroy(&zvol_ida); } @@ -1587,6 +1742,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); +module_param(zvol_num_taskqs, uint, 0444); +MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); + module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); @@ -1605,4 +1763,9 @@ MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, "Process volblocksize blocks per thread"); #endif +#ifndef HAVE_BLKDEV_GET_ERESTARTSYS +module_param(zvol_open_timeout_ms, uint, 0644); +MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); +#endif + /* END CSTYLED */ |