aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module/os/linux
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module/os/linux')
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c36
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-generic.c258
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c8
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c4
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c3
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-math-compat.c275
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-trace.c2
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-zone.c413
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/arc_os.c16
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c50
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c7
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c81
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c4
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c240
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c21
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c87
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c44
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c26
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c681
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c2
20 files changed, 1554 insertions, 704 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
deleted file mode 100644
index b6d967108fed..000000000000
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
- * Copyright (C) 2007 The Regents of the University of California.
- * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
- * UCRL-CODE-235197
- *
- * This file is part of the SPL, Solaris Porting Layer.
- *
- * The SPL is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * The SPL is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with the SPL. If not, see <http://www.gnu.org/licenses/>.
- *
- * Solaris Porting Layer (SPL) Atomic Implementation.
- */
-
-#include <sys/atomic.h>
-
-#ifdef ATOMIC_SPINLOCK
-/* Global atomic lock declarations */
-DEFINE_SPINLOCK(atomic32_lock);
-DEFINE_SPINLOCK(atomic64_lock);
-
-EXPORT_SYMBOL(atomic32_lock);
-EXPORT_SYMBOL(atomic64_lock);
-#endif /* ATOMIC_SPINLOCK */
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
index 89ca4a648b2f..585ad7377b49 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -197,266 +197,8 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len)
return (0);
}
-
-
EXPORT_SYMBOL(random_get_pseudo_bytes);
-#if BITS_PER_LONG == 32
-
-/*
- * Support 64/64 => 64 division on a 32-bit platform. While the kernel
- * provides a div64_u64() function for this we do not use it because the
- * implementation is flawed. There are cases which return incorrect
- * results as late as linux-2.6.35. Until this is fixed upstream the
- * spl must provide its own implementation.
- *
- * This implementation is a slightly modified version of the algorithm
- * proposed by the book 'Hacker's Delight'. The original source can be
- * found here and is available for use without restriction.
- *
- * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
- */
-
-/*
- * Calculate number of leading of zeros for a 64-bit value.
- */
-static int
-nlz64(uint64_t x)
-{
- register int n = 0;
-
- if (x == 0)
- return (64);
-
- if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
- if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
- if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; }
- if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; }
- if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; }
- if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; }
-
- return (n);
-}
-
-/*
- * Newer kernels have a div_u64() function but we define our own
- * to simplify portability between kernel versions.
- */
-static inline uint64_t
-__div_u64(uint64_t u, uint32_t v)
-{
- (void) do_div(u, v);
- return (u);
-}
-
-/*
- * Turn off missing prototypes warning for these functions. They are
- * replacements for libgcc-provided functions and will never be called
- * directly.
- */
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wmissing-prototypes"
-#endif
-
-/*
- * Implementation of 64-bit unsigned division for 32-bit machines.
- *
- * First the procedure takes care of the case in which the divisor is a
- * 32-bit quantity. There are two subcases: (1) If the left half of the
- * dividend is less than the divisor, one execution of do_div() is all that
- * is required (overflow is not possible). (2) Otherwise it does two
- * divisions, using the grade school method.
- */
-uint64_t
-__udivdi3(uint64_t u, uint64_t v)
-{
- uint64_t u0, u1, v1, q0, q1, k;
- int n;
-
- if (v >> 32 == 0) { // If v < 2**32:
- if (u >> 32 < v) { // If u/v cannot overflow,
- return (__div_u64(u, v)); // just do one division.
- } else { // If u/v would overflow:
- u1 = u >> 32; // Break u into two halves.
- u0 = u & 0xFFFFFFFF;
- q1 = __div_u64(u1, v); // First quotient digit.
- k = u1 - q1 * v; // First remainder, < v.
- u0 += (k << 32);
- q0 = __div_u64(u0, v); // Seconds quotient digit.
- return ((q1 << 32) + q0);
- }
- } else { // If v >= 2**32:
- n = nlz64(v); // 0 <= n <= 31.
- v1 = (v << n) >> 32; // Normalize divisor, MSB is 1.
- u1 = u >> 1; // To ensure no overflow.
- q1 = __div_u64(u1, v1); // Get quotient from
- q0 = (q1 << n) >> 31; // Undo normalization and
- // division of u by 2.
- if (q0 != 0) // Make q0 correct or
- q0 = q0 - 1; // too small by 1.
- if ((u - q0 * v) >= v)
- q0 = q0 + 1; // Now q0 is correct.
-
- return (q0);
- }
-}
-EXPORT_SYMBOL(__udivdi3);
-
-#ifndef abs64
-/* CSTYLED */
-#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
-#endif
-
-/*
- * Implementation of 64-bit signed division for 32-bit machines.
- */
-int64_t
-__divdi3(int64_t u, int64_t v)
-{
- int64_t q, t;
- q = __udivdi3(abs64(u), abs64(v));
- t = (u ^ v) >> 63; // If u, v have different
- return ((q ^ t) - t); // signs, negate q.
-}
-EXPORT_SYMBOL(__divdi3);
-
-/*
- * Implementation of 64-bit unsigned modulo for 32-bit machines.
- */
-uint64_t
-__umoddi3(uint64_t dividend, uint64_t divisor)
-{
- return (dividend - (divisor * __udivdi3(dividend, divisor)));
-}
-EXPORT_SYMBOL(__umoddi3);
-
-/* 64-bit signed modulo for 32-bit machines. */
-int64_t
-__moddi3(int64_t n, int64_t d)
-{
- int64_t q;
- boolean_t nn = B_FALSE;
-
- if (n < 0) {
- nn = B_TRUE;
- n = -n;
- }
- if (d < 0)
- d = -d;
-
- q = __umoddi3(n, d);
-
- return (nn ? -q : q);
-}
-EXPORT_SYMBOL(__moddi3);
-
-/*
- * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
- */
-uint64_t
-__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
-{
- uint64_t q = __udivdi3(n, d);
- if (r)
- *r = n - d * q;
- return (q);
-}
-EXPORT_SYMBOL(__udivmoddi4);
-
-/*
- * Implementation of 64-bit signed division/modulo for 32-bit machines.
- */
-int64_t
-__divmoddi4(int64_t n, int64_t d, int64_t *r)
-{
- int64_t q, rr;
- boolean_t nn = B_FALSE;
- boolean_t nd = B_FALSE;
- if (n < 0) {
- nn = B_TRUE;
- n = -n;
- }
- if (d < 0) {
- nd = B_TRUE;
- d = -d;
- }
-
- q = __udivmoddi4(n, d, (uint64_t *)&rr);
-
- if (nn != nd)
- q = -q;
- if (nn)
- rr = -rr;
- if (r)
- *r = rr;
- return (q);
-}
-EXPORT_SYMBOL(__divmoddi4);
-
-#if defined(__arm) || defined(__arm__)
-/*
- * Implementation of 64-bit (un)signed division for 32-bit arm machines.
- *
- * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned)
- * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1},
- * and the remainder in {r2, r3}. The return type is specifically left
- * set to 'void' to ensure the compiler does not overwrite these registers
- * during the return. All results are in registers as per ABI
- */
-void
-__aeabi_uldivmod(uint64_t u, uint64_t v)
-{
- uint64_t res;
- uint64_t mod;
-
- res = __udivdi3(u, v);
- mod = __umoddi3(u, v);
- {
- register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
- register uint32_t r1 asm("r1") = (res >> 32);
- register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
- register uint32_t r3 asm("r3") = (mod >> 32);
-
- asm volatile(""
- : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */
- : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
-
- return; /* r0; */
- }
-}
-EXPORT_SYMBOL(__aeabi_uldivmod);
-
-void
-__aeabi_ldivmod(int64_t u, int64_t v)
-{
- int64_t res;
- uint64_t mod;
-
- res = __divdi3(u, v);
- mod = __umoddi3(u, v);
- {
- register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
- register uint32_t r1 asm("r1") = (res >> 32);
- register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
- register uint32_t r3 asm("r3") = (mod >> 32);
-
- asm volatile(""
- : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */
- : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
-
- return; /* r0; */
- }
-}
-EXPORT_SYMBOL(__aeabi_ldivmod);
-#endif /* __arm || __arm__ */
-
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-
-#endif /* BITS_PER_LONG */
-
/*
* NOTE: The strtoxx behavior is solely based on my reading of the Solaris
* ddi_strtol(9F) man page. I have not verified the behavior of these
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
index 5594b2f80c02..6d496e68511e 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -139,12 +139,10 @@ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{
- gfp_t lflags = kmem_flags_convert(flags);
+ gfp_t lflags = kmem_flags_convert(flags | KM_VMEM);
void *ptr;
- if (skc->skc_flags & KMC_RECLAIMABLE)
- lflags |= __GFP_RECLAIMABLE;
- ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+ ptr = spl_vmalloc(size, lflags);
/* Resulting allocated memory will be page aligned */
ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
@@ -424,7 +422,7 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
if (!empty)
return (-EEXIST);
- if (skc->skc_flags & KMC_RECLAIMABLE)
+ if (skc->skc_flags & KMC_RECLAIMABLE && !(flags & KM_VMEM))
lflags |= __GFP_RECLAIMABLE;
ske = kmalloc(sizeof (*ske), lflags);
if (ske == NULL)
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
index 9fe008cef868..9fe4042b5079 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
@@ -188,7 +188,7 @@ spl_kvmalloc(size_t size, gfp_t lflags)
return (ptr);
}
- return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
+ return (spl_vmalloc(size, lflags));
}
/*
@@ -237,7 +237,7 @@ spl_kmem_alloc_impl(size_t size, int flags, int node)
*/
if (size > spl_kmem_alloc_max) {
if (flags & KM_VMEM) {
- ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+ ptr = spl_vmalloc(size, lflags);
} else {
return (NULL);
}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
index 02c5b42bc4a0..154ab12e84f7 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
@@ -531,7 +531,6 @@ kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
strlcpy(kpep->kpe_module, module, sizeof (kpep->kpe_module));
strlcpy(kpep->kpe_name, name, sizeof (kpep->kpe_name));
}
-EXPORT_SYMBOL(kstat_proc_entry_init);
kstat_t *
__kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
@@ -702,7 +701,6 @@ out:
mutex_exit(&kstat_module_lock);
}
-EXPORT_SYMBOL(kstat_proc_entry_install);
void
__kstat_install(kstat_t *ksp)
@@ -739,7 +737,6 @@ kstat_proc_entry_delete(kstat_proc_entry_t *kpep)
mutex_exit(&kstat_module_lock);
}
-EXPORT_SYMBOL(kstat_proc_entry_delete);
void
__kstat_delete(kstat_t *ksp)
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-math-compat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-math-compat.c
new file mode 100644
index 000000000000..3184db7f28b0
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-math-compat.c
@@ -0,0 +1,275 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Generic Implementation.
+ */
+
+#include <sys/isa_defs.h>
+#include <sys/sysmacros.h>
+
+/*
+ * 64-bit math support for 32-bit platforms. Compilers will generate
+ * references to the functions here if required.
+ */
+
+#if BITS_PER_LONG == 32
+
+/*
+ * Support 64/64 => 64 division on a 32-bit platform. While the kernel
+ * provides a div64_u64() function for this we do not use it because the
+ * implementation is flawed. There are cases which return incorrect
+ * results as late as linux-2.6.35. Until this is fixed upstream the
+ * spl must provide its own implementation.
+ *
+ * This implementation is a slightly modified version of the algorithm
+ * proposed by the book 'Hacker's Delight'. The original source can be
+ * found here and is available for use without restriction.
+ *
+ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
+ */
+
+/*
+ * Calculate the number of leading zeros for a 64-bit value.
+ */
+static int
+nlz64(uint64_t x)
+{
+ register int n = 0;
+
+ if (x == 0)
+ return (64);
+
+ if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
+ if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
+ if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; }
+ if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; }
+ if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; }
+ if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; }
+
+ return (n);
+}
+
+/*
+ * Newer kernels have a div_u64() function but we define our own
+ * to simplify portability between kernel versions.
+ */
+static inline uint64_t
+__div_u64(uint64_t u, uint32_t v)
+{
+ (void) do_div(u, v);
+ return (u);
+}
+
+/*
+ * Implementation of 64-bit unsigned division for 32-bit machines.
+ *
+ * First the procedure takes care of the case in which the divisor is a
+ * 32-bit quantity. There are two subcases: (1) If the left half of the
+ * dividend is less than the divisor, one execution of do_div() is all that
+ * is required (overflow is not possible). (2) Otherwise it does two
+ * divisions, using the grade school method.
+ */
+uint64_t
+__udivdi3(uint64_t u, uint64_t v)
+{
+ uint64_t u0, u1, v1, q0, q1, k;
+ int n;
+
+ if (v >> 32 == 0) { // If v < 2**32:
+ if (u >> 32 < v) { // If u/v cannot overflow,
+ return (__div_u64(u, v)); // just do one division.
+ } else { // If u/v would overflow:
+ u1 = u >> 32; // Break u into two halves.
+ u0 = u & 0xFFFFFFFF;
+ q1 = __div_u64(u1, v); // First quotient digit.
+ k = u1 - q1 * v; // First remainder, < v.
+ u0 += (k << 32);
+ q0 = __div_u64(u0, v); // Second quotient digit.
+ return ((q1 << 32) + q0);
+ }
+ } else { // If v >= 2**32:
+ n = nlz64(v); // 0 <= n <= 31.
+ v1 = (v << n) >> 32; // Normalize divisor, MSB is 1.
+ u1 = u >> 1; // To ensure no overflow.
+ q1 = __div_u64(u1, v1); // Get quotient from
+ q0 = (q1 << n) >> 31; // Undo normalization and
+ // division of u by 2.
+ if (q0 != 0) // Make q0 correct or
+ q0 = q0 - 1; // too small by 1.
+ if ((u - q0 * v) >= v)
+ q0 = q0 + 1; // Now q0 is correct.
+
+ return (q0);
+ }
+}
+EXPORT_SYMBOL(__udivdi3);
+
+#ifndef abs64
+/* CSTYLED */
+#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
+#endif
+
+/*
+ * Implementation of 64-bit signed division for 32-bit machines.
+ */
+int64_t
+__divdi3(int64_t u, int64_t v)
+{
+ int64_t q, t;
+ q = __udivdi3(abs64(u), abs64(v));
+ t = (u ^ v) >> 63; // If u, v have different
+ return ((q ^ t) - t); // signs, negate q.
+}
+EXPORT_SYMBOL(__divdi3);
+
+/*
+ * Implementation of 64-bit unsigned modulo for 32-bit machines.
+ */
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divisor)
+{
+ return (dividend - (divisor * __udivdi3(dividend, divisor)));
+}
+EXPORT_SYMBOL(__umoddi3);
+
+/* 64-bit signed modulo for 32-bit machines. */
+int64_t
+__moddi3(int64_t n, int64_t d)
+{
+ int64_t q;
+ boolean_t nn = B_FALSE;
+
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0)
+ d = -d;
+
+ q = __umoddi3(n, d);
+
+ return (nn ? -q : q);
+}
+EXPORT_SYMBOL(__moddi3);
+
+/*
+ * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
+ */
+uint64_t
+__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
+{
+ uint64_t q = __udivdi3(n, d);
+ if (r)
+ *r = n - d * q;
+ return (q);
+}
+EXPORT_SYMBOL(__udivmoddi4);
+
+/*
+ * Implementation of 64-bit signed division/modulo for 32-bit machines.
+ */
+int64_t
+__divmoddi4(int64_t n, int64_t d, int64_t *r)
+{
+ int64_t q, rr;
+ boolean_t nn = B_FALSE;
+ boolean_t nd = B_FALSE;
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0) {
+ nd = B_TRUE;
+ d = -d;
+ }
+
+ q = __udivmoddi4(n, d, (uint64_t *)&rr);
+
+ if (nn != nd)
+ q = -q;
+ if (nn)
+ rr = -rr;
+ if (r)
+ *r = rr;
+ return (q);
+}
+EXPORT_SYMBOL(__divmoddi4);
+
+#if defined(__arm) || defined(__arm__)
+/*
+ * Implementation of 64-bit (un)signed division for 32-bit arm machines.
+ *
+ * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned)
+ * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1},
+ * and the remainder in {r2, r3}. The return type is specifically left
+ * set to 'void' to ensure the compiler does not overwrite these registers
+ * during the return. All results are in registers as per ABI
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+ uint64_t res;
+ uint64_t mod;
+
+ res = __udivdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+ int64_t res;
+ uint64_t mod;
+
+ res = __divdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+
+#endif /* BITS_PER_LONG */
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
index 1c984f221c7d..76ee71074cb5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
@@ -27,8 +27,6 @@
#include <sys/taskq.h>
-#ifdef _KERNEL
#define CREATE_TRACE_POINTS
#include <sys/trace.h>
#include <sys/trace_taskq.h>
-#endif
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
index b2eae5d00b10..5992957280e4 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c
@@ -59,6 +59,18 @@ typedef struct zone_dataset {
char zd_dsname[]; /* name of the member dataset */
} zone_dataset_t;
+/*
+ * UID-based dataset zoning: allows delegating datasets to all user
+ * namespaces owned by a specific UID, enabling rootless container support.
+ */
+typedef struct zone_uid_datasets {
+ struct list_head zuds_list; /* zone_uid_datasets linkage */
+ kuid_t zuds_owner; /* owner UID */
+ struct list_head zuds_datasets; /* datasets for this UID */
+} zone_uid_datasets_t;
+
+static struct list_head zone_uid_datasets;
+
#ifdef CONFIG_USER_NS
/*
@@ -138,6 +150,18 @@ zone_datasets_lookup(unsigned int nsinum)
}
#ifdef CONFIG_USER_NS
+static zone_uid_datasets_t *
+zone_uid_datasets_lookup(kuid_t owner)
+{
+ zone_uid_datasets_t *zuds;
+
+ list_for_each_entry(zuds, &zone_uid_datasets, zuds_list) {
+ if (uid_eq(zuds->zuds_owner, owner))
+ return (zuds);
+ }
+ return (NULL);
+}
+
static struct zone_dataset *
zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
{
@@ -232,6 +256,62 @@ zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
EXPORT_SYMBOL(zone_dataset_attach);
int
+zone_dataset_attach_uid(cred_t *cred, const char *dataset, uid_t owner_uid)
+{
+#ifdef CONFIG_USER_NS
+ zone_uid_datasets_t *zuds;
+ zone_dataset_t *zd;
+ int error;
+ size_t dsnamelen;
+ kuid_t kowner;
+
+ /* Only root can attach datasets to UIDs */
+ if ((error = zone_dataset_cred_check(cred)) != 0)
+ return (error);
+ if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
+ return (error);
+
+ kowner = make_kuid(current_user_ns(), owner_uid);
+ if (!uid_valid(kowner))
+ return (EINVAL);
+
+ mutex_enter(&zone_datasets_lock);
+
+ /* Find or create UID entry */
+ zuds = zone_uid_datasets_lookup(kowner);
+ if (zuds == NULL) {
+ zuds = kmem_alloc(sizeof (zone_uid_datasets_t), KM_SLEEP);
+ INIT_LIST_HEAD(&zuds->zuds_list);
+ INIT_LIST_HEAD(&zuds->zuds_datasets);
+ zuds->zuds_owner = kowner;
+ list_add_tail(&zuds->zuds_list, &zone_uid_datasets);
+ } else {
+ /* Check if dataset already attached */
+ list_for_each_entry(zd, &zuds->zuds_datasets, zd_list) {
+ if (zd->zd_dsnamelen == dsnamelen &&
+ strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) {
+ mutex_exit(&zone_datasets_lock);
+ return (EEXIST);
+ }
+ }
+ }
+
+ /* Add dataset to UID's list */
+ zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
+ zd->zd_dsnamelen = dsnamelen;
+ strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
+ INIT_LIST_HEAD(&zd->zd_list);
+ list_add_tail(&zd->zd_list, &zuds->zuds_datasets);
+
+ mutex_exit(&zone_datasets_lock);
+ return (0);
+#else
+ return (ENXIO);
+#endif /* CONFIG_USER_NS */
+}
+EXPORT_SYMBOL(zone_dataset_attach_uid);
+
+int
zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
{
#ifdef CONFIG_USER_NS
@@ -280,6 +360,217 @@ zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
}
EXPORT_SYMBOL(zone_dataset_detach);
+int
+zone_dataset_detach_uid(cred_t *cred, const char *dataset, uid_t owner_uid)
+{
+#ifdef CONFIG_USER_NS
+ zone_uid_datasets_t *zuds;
+ zone_dataset_t *zd;
+ int error;
+ size_t dsnamelen;
+ kuid_t kowner;
+
+ if ((error = zone_dataset_cred_check(cred)) != 0)
+ return (error);
+ if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
+ return (error);
+
+ kowner = make_kuid(current_user_ns(), owner_uid);
+ if (!uid_valid(kowner))
+ return (EINVAL);
+
+ mutex_enter(&zone_datasets_lock);
+
+ zuds = zone_uid_datasets_lookup(kowner);
+ if (zuds == NULL) {
+ mutex_exit(&zone_datasets_lock);
+ return (ENOENT);
+ }
+
+ /* Find and remove dataset */
+ list_for_each_entry(zd, &zuds->zuds_datasets, zd_list) {
+ if (zd->zd_dsnamelen == dsnamelen &&
+ strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) {
+ list_del(&zd->zd_list);
+ kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
+
+ /* Remove UID entry if no more datasets */
+ if (list_empty(&zuds->zuds_datasets)) {
+ list_del(&zuds->zuds_list);
+ kmem_free(zuds, sizeof (*zuds));
+ }
+
+ mutex_exit(&zone_datasets_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zone_datasets_lock);
+ return (ENOENT);
+#else
+ return (ENXIO);
+#endif /* CONFIG_USER_NS */
+}
+EXPORT_SYMBOL(zone_dataset_detach_uid);
+
+/*
+ * Callback for looking up zoned_uid property (registered by ZFS module).
+ */
+static zone_get_zoned_uid_fn_t zone_get_zoned_uid_fn = NULL;
+
+void
+zone_register_zoned_uid_callback(zone_get_zoned_uid_fn_t fn)
+{
+ zone_get_zoned_uid_fn = fn;
+}
+EXPORT_SYMBOL(zone_register_zoned_uid_callback);
+
+void
+zone_unregister_zoned_uid_callback(void)
+{
+ zone_get_zoned_uid_fn = NULL;
+}
+EXPORT_SYMBOL(zone_unregister_zoned_uid_callback);
+
+#ifdef CONFIG_USER_NS
+/*
+ * Check if a dataset is the delegation root (has zoned_uid set locally).
+ */
+static boolean_t
+zone_dataset_is_zoned_uid_root(const char *dataset, uid_t zoned_uid)
+{
+ char *root;
+ uid_t found_uid;
+ boolean_t is_root;
+
+ if (zone_get_zoned_uid_fn == NULL)
+ return (B_FALSE);
+
+ root = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ found_uid = zone_get_zoned_uid_fn(dataset, root, MAXPATHLEN);
+ is_root = (found_uid == zoned_uid && strcmp(root, dataset) == 0);
+ kmem_free(root, MAXPATHLEN);
+ return (is_root);
+}
+#endif /* CONFIG_USER_NS */
+
+/*
+ * Core authorization check for zoned_uid write delegation.
+ */
+zone_admin_result_t
+zone_dataset_admin_check(const char *dataset, zone_uid_op_t op,
+ const char *aux_dataset)
+{
+#ifdef CONFIG_USER_NS
+ struct user_namespace *user_ns;
+ char *delegation_root;
+ uid_t zoned_uid, ns_owner_uid;
+ int write_unused;
+ zone_admin_result_t result = ZONE_ADMIN_NOT_APPLICABLE;
+
+ /* Step 1: If in global zone, not applicable */
+ if (INGLOBALZONE(curproc))
+ return (ZONE_ADMIN_NOT_APPLICABLE);
+
+ /* Step 2: Need callback to be registered */
+ if (zone_get_zoned_uid_fn == NULL)
+ return (ZONE_ADMIN_NOT_APPLICABLE);
+
+ delegation_root = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ /* Step 3: Find delegation root */
+ zoned_uid = zone_get_zoned_uid_fn(dataset, delegation_root,
+ MAXPATHLEN);
+ if (zoned_uid == 0)
+ goto out;
+
+ /* Step 4: Verify namespace owner matches */
+ user_ns = current_user_ns();
+ ns_owner_uid = from_kuid(&init_user_ns, user_ns->owner);
+ if (ns_owner_uid != zoned_uid)
+ goto out;
+
+ /* Step 5: Tiered capability check based on operation class */
+ {
+ int required_cap;
+ switch (op) {
+ case ZONE_OP_DESTROY:
+ case ZONE_OP_RENAME:
+ case ZONE_OP_CLONE:
+ required_cap = CAP_SYS_ADMIN;
+ break;
+ case ZONE_OP_CREATE:
+ case ZONE_OP_SNAPSHOT:
+ case ZONE_OP_SETPROP:
+ required_cap = CAP_FOWNER;
+ break;
+ default:
+ required_cap = CAP_SYS_ADMIN;
+ break;
+ }
+ if (!ns_capable(user_ns, required_cap)) {
+ result = ZONE_ADMIN_DENIED;
+ goto out;
+ }
+ }
+
+ /* Step 6: Operation-specific constraints */
+ switch (op) {
+ case ZONE_OP_DESTROY:
+ /* Cannot destroy the delegation root itself */
+ if (zone_dataset_is_zoned_uid_root(dataset, zoned_uid)) {
+ result = ZONE_ADMIN_DENIED;
+ goto out;
+ }
+ break;
+
+ case ZONE_OP_RENAME:
+ /* Cannot rename outside delegation subtree */
+ if (aux_dataset != NULL) {
+ char *dst_root;
+ uid_t dst_uid;
+
+ dst_root = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ dst_uid = zone_get_zoned_uid_fn(aux_dataset,
+ dst_root, MAXPATHLEN);
+ if (dst_uid != zoned_uid ||
+ strcmp(dst_root, delegation_root) != 0) {
+ kmem_free(dst_root, MAXPATHLEN);
+ result = ZONE_ADMIN_DENIED;
+ goto out;
+ }
+ kmem_free(dst_root, MAXPATHLEN);
+ }
+ break;
+
+ case ZONE_OP_CLONE:
+ /* Clone source must be visible */
+ if (aux_dataset != NULL) {
+ if (!zone_dataset_visible(aux_dataset, &write_unused)) {
+ result = ZONE_ADMIN_DENIED;
+ goto out;
+ }
+ }
+ break;
+
+ case ZONE_OP_CREATE:
+ case ZONE_OP_SNAPSHOT:
+ case ZONE_OP_SETPROP:
+ /* No additional constraints */
+ break;
+ }
+
+ result = ZONE_ADMIN_ALLOWED;
+out:
+ kmem_free(delegation_root, MAXPATHLEN);
+ return (result);
+#else
+ (void) dataset, (void) op, (void) aux_dataset;
+ return (ZONE_ADMIN_NOT_APPLICABLE);
+#endif
+}
+EXPORT_SYMBOL(zone_dataset_admin_check);
+
/*
* A dataset is visible if:
* - It is a parent of a namespace entry.
@@ -293,34 +584,19 @@ EXPORT_SYMBOL(zone_dataset_detach);
* The parent datasets of namespace entries are visible and
* read-only to provide a path back to the root of the pool.
*/
-int
-zone_dataset_visible(const char *dataset, int *write)
+/*
+ * Helper function to check if a dataset matches against a list of
+ * delegated datasets. Returns visibility and sets write permission.
+ */
+static int
+zone_dataset_check_list(struct list_head *datasets, const char *dataset,
+ size_t dsnamelen, int *write)
{
- zone_datasets_t *zds;
zone_dataset_t *zd;
- size_t dsnamelen, zd_len;
- int visible;
-
- /* Default to read-only, in case visible is returned. */
- if (write != NULL)
- *write = 0;
- if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
- return (0);
- if (INGLOBALZONE(curproc)) {
- if (write != NULL)
- *write = 1;
- return (1);
- }
+ size_t zd_len;
+ int visible = 0;
- mutex_enter(&zone_datasets_lock);
- zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
- if (zds == NULL) {
- mutex_exit(&zone_datasets_lock);
- return (0);
- }
-
- visible = 0;
- list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
+ list_for_each_entry(zd, datasets, zd_list) {
zd_len = strlen(zd->zd_dsname);
if (zd_len > dsnamelen) {
/*
@@ -352,7 +628,8 @@ zone_dataset_visible(const char *dataset, int *write)
* the namespace entry.
*/
visible = memcmp(zd->zd_dsname, dataset,
- zd_len) == 0 && dataset[zd_len] == '/';
+ zd_len) == 0 && (dataset[zd_len] == '/' ||
+ dataset[zd_len] == '@' || dataset[zd_len] == '#');
if (visible) {
if (write != NULL)
*write = 1;
@@ -361,9 +638,70 @@ zone_dataset_visible(const char *dataset, int *write)
}
}
- mutex_exit(&zone_datasets_lock);
return (visible);
}
+
+#if defined(CONFIG_USER_NS)
+/*
+ * Check UID-based zoning visibility for the current process.
+ * Must be called with zone_datasets_lock held.
+ */
+static int
+zone_dataset_visible_uid(const char *dataset, size_t dsnamelen, int *write)
+{
+ zone_uid_datasets_t *zuds;
+
+ zuds = zone_uid_datasets_lookup(curproc->cred->user_ns->owner);
+ if (zuds != NULL)
+ return (zone_dataset_check_list(&zuds->zuds_datasets, dataset,
+ dsnamelen, write));
+ return (0);
+}
+#endif
+
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ zone_datasets_t *zds;
+ size_t dsnamelen;
+ int visible;
+
+ /* Default to read-only, in case visible is returned. */
+ if (write != NULL)
+ *write = 0;
+ if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
+ return (0);
+ if (INGLOBALZONE(curproc)) {
+ if (write != NULL)
+ *write = 1;
+ return (1);
+ }
+
+ mutex_enter(&zone_datasets_lock);
+
+ /* First, check namespace-specific zoning (existing behavior) */
+ zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
+ if (zds != NULL) {
+ visible = zone_dataset_check_list(&zds->zds_datasets, dataset,
+ dsnamelen, write);
+ if (visible) {
+ mutex_exit(&zone_datasets_lock);
+ return (visible);
+ }
+ }
+
+ /* Second, check UID-based zoning */
+#if defined(CONFIG_USER_NS)
+ visible = zone_dataset_visible_uid(dataset, dsnamelen, write);
+ if (visible) {
+ mutex_exit(&zone_datasets_lock);
+ return (visible);
+ }
+#endif
+
+ mutex_exit(&zone_datasets_lock);
+ return (0);
+}
EXPORT_SYMBOL(zone_dataset_visible);
unsigned int
@@ -395,8 +733,9 @@ EXPORT_SYMBOL(crgetzoneid);
boolean_t
inglobalzone(proc_t *proc)
{
+ (void) proc;
#if defined(CONFIG_USER_NS)
- return (proc->cred->user_ns == &init_user_ns);
+ return (current_user_ns() == &init_user_ns);
#else
return (B_TRUE);
#endif
@@ -408,6 +747,7 @@ spl_zone_init(void)
{
mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
INIT_LIST_HEAD(&zone_datasets);
+ INIT_LIST_HEAD(&zone_uid_datasets);
return (0);
}
@@ -415,6 +755,7 @@ void
spl_zone_fini(void)
{
zone_datasets_t *zds;
+ zone_uid_datasets_t *zuds;
zone_dataset_t *zd;
/*
@@ -423,6 +764,22 @@ spl_zone_fini(void)
* namespace is destroyed, just do it here, since spl is about to go
* out of context.
*/
+
+ /* Clean up UID-based delegations */
+ while (!list_empty(&zone_uid_datasets)) {
+ zuds = list_entry(zone_uid_datasets.next,
+ zone_uid_datasets_t, zuds_list);
+ while (!list_empty(&zuds->zuds_datasets)) {
+ zd = list_entry(zuds->zuds_datasets.next,
+ zone_dataset_t, zd_list);
+ list_del(&zd->zd_list);
+ kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
+ }
+ list_del(&zuds->zuds_list);
+ kmem_free(zuds, sizeof (*zuds));
+ }
+
+ /* Clean up namespace-based delegations */
while (!list_empty(&zone_datasets)) {
zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
while (!list_empty(&zds->zds_datasets)) {
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
index 6478c834b7a5..dbc9aad936bf 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -410,6 +410,22 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
return (0);
}
+int
+param_set_l2arc_dwpd_limit(const char *buf, zfs_kernel_param_t *kp)
+{
+ uint64_t old_val = l2arc_dwpd_limit;
+ int error;
+
+ error = spl_param_set_u64(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ if (l2arc_dwpd_limit != old_val)
+ l2arc_dwpd_bump_reset();
+
+ return (0);
+}
+
#ifdef CONFIG_MEMORY_HOTPLUG
static int
arc_hotplug_callback(struct notifier_block *self, unsigned long action,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
index d6323fd56a8f..91010bdf642a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
@@ -39,8 +39,10 @@
#include <sys/dsl_prop.h>
#include <sys/fm/util.h>
#include <sys/dsl_scan.h>
+#include <sys/dmu.h>
#include <sys/fs/zfs.h>
#include <sys/kstat.h>
+#include <sys/zone.h>
#include "zfs_prop.h"
@@ -122,16 +124,60 @@ spa_history_zone(void)
return ("linux");
}
+static int
+spa_restore_zoned_uid_cb(const char *dsname, void *arg)
+{
+ (void) arg;
+ uint64_t zoned_uid = 0;
+
+ if (dsl_prop_get(dsname, "zoned_uid", 8, 1, &zoned_uid, NULL) != 0)
+ return (0);
+
+ if (zoned_uid != 0) {
+ int err = zone_dataset_attach_uid(kcred, dsname,
+ (uid_t)zoned_uid);
+ if (err != 0 && err != EEXIST) {
+ cmn_err(CE_WARN, "failed to restore zoned_uid for "
+ "'%s' (uid %llu): %d", dsname,
+ (unsigned long long)zoned_uid, err);
+ }
+ }
+ return (0);
+}
+
void
spa_import_os(spa_t *spa)
{
- (void) spa;
+ (void) dmu_objset_find(spa_name(spa),
+ spa_restore_zoned_uid_cb, NULL, DS_FIND_CHILDREN);
+}
+
+static int
+spa_cleanup_zoned_uid_cb(const char *dsname, void *arg)
+{
+ (void) arg;
+ uint64_t zoned_uid = 0;
+
+ if (dsl_prop_get(dsname, "zoned_uid", 8, 1, &zoned_uid, NULL) != 0)
+ return (0);
+
+ if (zoned_uid != 0) {
+ int err = zone_dataset_detach_uid(kcred, dsname,
+ (uid_t)zoned_uid);
+ if (err != 0 && err != ENOENT) {
+ cmn_err(CE_WARN, "failed to detach zoned_uid for "
+ "'%s' (uid %llu): %d", dsname,
+ (unsigned long long)zoned_uid, err);
+ }
+ }
+ return (0);
}
void
spa_export_os(spa_t *spa)
{
- (void) spa;
+ (void) dmu_objset_find(spa_name(spa),
+ spa_cleanup_zoned_uid_cb, NULL, DS_FIND_CHILDREN);
}
void
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index 1bd3500e9f66..66e10584ab5e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -445,7 +445,14 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);
/* Inform the ZIO pipeline that we are non-rotational */
+#ifdef HAVE_BLK_QUEUE_ROT
+ v->vdev_nonrot = !blk_queue_rot(bdev_get_queue(bdev));
+#else
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
+#endif
+
+ /* Is backed by a block device. */
+ v->vdev_is_blkdev = B_TRUE;
/* Physical volume size in bytes for the partition */
*psize = bdev_capacity(bdev);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index 79fd8911102d..c73ef86df4dc 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -251,16 +251,7 @@ snapentry_compare_by_name(const void *a, const void *b)
{
const zfs_snapentry_t *se_a = a;
const zfs_snapentry_t *se_b = b;
- int ret;
-
- ret = strcmp(se_a->se_name, se_b->se_name);
-
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
+ return (TREE_ISIGN(strcmp(se_a->se_name, se_b->se_name)));
}
/*
@@ -272,15 +263,10 @@ snapentry_compare_by_objsetid(const void *a, const void *b)
const zfs_snapentry_t *se_a = a;
const zfs_snapentry_t *se_b = b;
- if (se_a->se_spa != se_b->se_spa)
- return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
-
- if (se_a->se_objsetid < se_b->se_objsetid)
- return (-1);
- else if (se_a->se_objsetid > se_b->se_objsetid)
- return (1);
- else
- return (0);
+ int cmp = TREE_PCMP(se_a->se_spa, se_b->se_spa);
+ if (cmp != 0)
+ return (cmp);
+ return (TREE_CMP(se_a->se_objsetid, se_b->se_objsetid));
}
/*
@@ -1201,8 +1187,10 @@ zfsctl_snapshot_mount(struct path *path, int flags)
error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
ZFS_MAX_DATASET_NAME_LEN, full_name);
- if (error)
+ if (error) {
+ zfs_exit(zfsvfs, FTAG);
goto error;
+ }
if (is_current_chrooted() == 0) {
/*
@@ -1220,6 +1208,7 @@ zfsctl_snapshot_mount(struct path *path, int flags)
error = get_root_path(&mnt_path, m, MAXPATHLEN);
if (error != 0) {
kmem_free(m, MAXPATHLEN);
+ zfs_exit(zfsvfs, FTAG);
goto error;
}
mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
@@ -1253,6 +1242,33 @@ zfsctl_snapshot_mount(struct path *path, int flags)
zfs_snapshot_no_setuid ? "nosuid" : "suid");
/*
+ * Release z_teardown_lock before potentially blocking operations
+ * (cv_wait for concurrent mounts, call_usermodehelper for the mount
+ * helper). Holding z_teardown_lock(R) across call_usermodehelper
+ * deadlocks with namespace_sem: the mount helper needs
+ * namespace_sem(W) via move_mount, while /proc/self/mountinfo
+ * readers hold namespace_sem(R) and need z_teardown_lock(R) via
+ * zpl_show_devname. A concurrent zfs_suspend_fs queuing
+ * z_teardown_lock(W) blocks new readers, completing the cycle.
+ * See https://github.com/openzfs/zfs/issues/18409
+ *
+ * Releasing the lock allows zfs_suspend_fs to proceed during
+ * the mount, so dmu_objset_hold in zpl_get_tree can transiently
+ * fail with ENOENT during the clone swap. The mount helper
+ * fails, this function returns EISDIR, and the VFS silently
+ * falls back to the ctldir stub (empty directory). The caller
+ * gets the stub inode instead of the real snapshot root until
+ * the next access retries the automount.
+ *
+ * Safe because everything below operates on local string copies
+ * (full_name, full_path) or uses its own synchronization
+ * (zfs_snapshot_lock, se_mtx). The parent zfsvfs pointer
+ * remains valid because we hold a path reference to the
+ * automount trigger dentry.
+ */
+ zfs_exit(zfsvfs, FTAG);
+
+ /*
* Check if snapshot is already being mounted. If found, wait for
* pending mount to complete before returning success.
*/
@@ -1366,8 +1382,7 @@ zfsctl_snapshot_mount(struct path *path, int flags)
error:
kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
kmem_free(full_path, MAXPATHLEN);
-
- zfs_exit(zfsvfs, FTAG);
+ kmem_free(options, 7);
return (error);
}
@@ -1379,17 +1394,31 @@ int
zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
struct inode **ipp)
{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
int error;
struct path path;
char *mnt;
struct dentry *dentry;
+ zfs_snapentry_t *se;
mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
- MAXPATHLEN, mnt);
- if (error)
- goto out;
+ /*
+ * Try the in-memory AVL tree first for previously mounted
+ * snapshots, falling back to the on-disk scan if not found.
+ */
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ se = zfsctl_snapshot_find_by_objsetid(zfsvfs->z_os->os_spa, objsetid);
+ rw_exit(&zfs_snapshot_lock);
+ if (se != NULL) {
+ strlcpy(mnt, se->se_path, MAXPATHLEN);
+ zfsctl_snapshot_rele(se);
+ } else {
+ error = zfsctl_snapshot_path_objset(zfsvfs, objsetid,
+ MAXPATHLEN, mnt);
+ if (error)
+ goto out;
+ }
/* Trigger automount */
error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
index 5421a441b323..ce6092be1da7 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
@@ -170,6 +170,8 @@ zfs_ioc_userns_attach(zfs_cmd_t *zc)
*/
if (error == ENOTTY)
error = ZFS_ERR_NOT_USER_NAMESPACE;
+ if (error == ENXIO)
+ error = ZFS_ERR_NO_USER_NS_SUPPORT;
return (error);
}
@@ -190,6 +192,8 @@ zfs_ioc_userns_detach(zfs_cmd_t *zc)
*/
if (error == ENOTTY)
error = ZFS_ERR_NOT_USER_NAMESPACE;
+ if (error == ENXIO)
+ error = ZFS_ERR_NO_USER_NS_SUPPORT;
return (error);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index 8a7d14ab6119..9c0d92551843 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2026, TrueNAS.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -64,53 +65,15 @@
#include <linux/fs.h>
#include "zfs_comutil.h"
-enum {
- TOKEN_RO,
- TOKEN_RW,
- TOKEN_SETUID,
- TOKEN_NOSETUID,
- TOKEN_EXEC,
- TOKEN_NOEXEC,
- TOKEN_DEVICES,
- TOKEN_NODEVICES,
- TOKEN_DIRXATTR,
- TOKEN_SAXATTR,
- TOKEN_XATTR,
- TOKEN_NOXATTR,
- TOKEN_ATIME,
- TOKEN_NOATIME,
- TOKEN_RELATIME,
- TOKEN_NORELATIME,
- TOKEN_NBMAND,
- TOKEN_NONBMAND,
- TOKEN_MNTPOINT,
- TOKEN_LAST,
-};
-
-static const match_table_t zpl_tokens = {
- { TOKEN_RO, MNTOPT_RO },
- { TOKEN_RW, MNTOPT_RW },
- { TOKEN_SETUID, MNTOPT_SETUID },
- { TOKEN_NOSETUID, MNTOPT_NOSETUID },
- { TOKEN_EXEC, MNTOPT_EXEC },
- { TOKEN_NOEXEC, MNTOPT_NOEXEC },
- { TOKEN_DEVICES, MNTOPT_DEVICES },
- { TOKEN_NODEVICES, MNTOPT_NODEVICES },
- { TOKEN_DIRXATTR, MNTOPT_DIRXATTR },
- { TOKEN_SAXATTR, MNTOPT_SAXATTR },
- { TOKEN_XATTR, MNTOPT_XATTR },
- { TOKEN_NOXATTR, MNTOPT_NOXATTR },
- { TOKEN_ATIME, MNTOPT_ATIME },
- { TOKEN_NOATIME, MNTOPT_NOATIME },
- { TOKEN_RELATIME, MNTOPT_RELATIME },
- { TOKEN_NORELATIME, MNTOPT_NORELATIME },
- { TOKEN_NBMAND, MNTOPT_NBMAND },
- { TOKEN_NONBMAND, MNTOPT_NONBMAND },
- { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" },
- { TOKEN_LAST, NULL },
-};
+vfs_t *
+zfsvfs_vfs_alloc(void)
+{
+ vfs_t *vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
+ mutex_init(&vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (vfsp);
+}
-static void
+void
zfsvfs_vfs_free(vfs_t *vfsp)
{
if (vfsp != NULL) {
@@ -121,139 +84,6 @@ zfsvfs_vfs_free(vfs_t *vfsp)
}
}
-static int
-zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp)
-{
- switch (token) {
- case TOKEN_RO:
- vfsp->vfs_readonly = B_TRUE;
- vfsp->vfs_do_readonly = B_TRUE;
- break;
- case TOKEN_RW:
- vfsp->vfs_readonly = B_FALSE;
- vfsp->vfs_do_readonly = B_TRUE;
- break;
- case TOKEN_SETUID:
- vfsp->vfs_setuid = B_TRUE;
- vfsp->vfs_do_setuid = B_TRUE;
- break;
- case TOKEN_NOSETUID:
- vfsp->vfs_setuid = B_FALSE;
- vfsp->vfs_do_setuid = B_TRUE;
- break;
- case TOKEN_EXEC:
- vfsp->vfs_exec = B_TRUE;
- vfsp->vfs_do_exec = B_TRUE;
- break;
- case TOKEN_NOEXEC:
- vfsp->vfs_exec = B_FALSE;
- vfsp->vfs_do_exec = B_TRUE;
- break;
- case TOKEN_DEVICES:
- vfsp->vfs_devices = B_TRUE;
- vfsp->vfs_do_devices = B_TRUE;
- break;
- case TOKEN_NODEVICES:
- vfsp->vfs_devices = B_FALSE;
- vfsp->vfs_do_devices = B_TRUE;
- break;
- case TOKEN_DIRXATTR:
- vfsp->vfs_xattr = ZFS_XATTR_DIR;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_SAXATTR:
- vfsp->vfs_xattr = ZFS_XATTR_SA;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_XATTR:
- vfsp->vfs_xattr = ZFS_XATTR_SA;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_NOXATTR:
- vfsp->vfs_xattr = ZFS_XATTR_OFF;
- vfsp->vfs_do_xattr = B_TRUE;
- break;
- case TOKEN_ATIME:
- vfsp->vfs_atime = B_TRUE;
- vfsp->vfs_do_atime = B_TRUE;
- break;
- case TOKEN_NOATIME:
- vfsp->vfs_atime = B_FALSE;
- vfsp->vfs_do_atime = B_TRUE;
- break;
- case TOKEN_RELATIME:
- vfsp->vfs_relatime = B_TRUE;
- vfsp->vfs_do_relatime = B_TRUE;
- break;
- case TOKEN_NORELATIME:
- vfsp->vfs_relatime = B_FALSE;
- vfsp->vfs_do_relatime = B_TRUE;
- break;
- case TOKEN_NBMAND:
- vfsp->vfs_nbmand = B_TRUE;
- vfsp->vfs_do_nbmand = B_TRUE;
- break;
- case TOKEN_NONBMAND:
- vfsp->vfs_nbmand = B_FALSE;
- vfsp->vfs_do_nbmand = B_TRUE;
- break;
- case TOKEN_MNTPOINT:
- if (vfsp->vfs_mntpoint != NULL)
- kmem_strfree(vfsp->vfs_mntpoint);
- vfsp->vfs_mntpoint = match_strdup(&args[0]);
- if (vfsp->vfs_mntpoint == NULL)
- return (SET_ERROR(ENOMEM));
- break;
- default:
- break;
- }
-
- return (0);
-}
-
-/*
- * Parse the raw mntopts and return a vfs_t describing the options.
- */
-static int
-zfsvfs_parse_options(char *mntopts, vfs_t **vfsp)
-{
- vfs_t *tmp_vfsp;
- int error;
-
- tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
- mutex_init(&tmp_vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL);
-
- if (mntopts != NULL) {
- substring_t args[MAX_OPT_ARGS];
- char *tmp_mntopts, *p, *t;
- int token;
-
- tmp_mntopts = t = kmem_strdup(mntopts);
- if (tmp_mntopts == NULL)
- return (SET_ERROR(ENOMEM));
-
- while ((p = strsep(&t, ",")) != NULL) {
- if (!*p)
- continue;
-
- args[0].to = args[0].from = NULL;
- token = match_token(p, zpl_tokens, args);
- error = zfsvfs_parse_option(p, token, args, tmp_vfsp);
- if (error) {
- kmem_strfree(tmp_mntopts);
- zfsvfs_vfs_free(tmp_vfsp);
- return (error);
- }
- }
-
- kmem_strfree(tmp_mntopts);
- }
-
- *vfsp = tmp_vfsp;
-
- return (0);
-}
-
boolean_t
zfs_is_readonly(zfsvfs_t *zfsvfs)
{
@@ -1486,20 +1316,16 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
static atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
int
-zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
+zfs_domount(struct super_block *sb, const char *osname,
+ vfs_t *vfs, int silent)
{
- const char *osname = zm->mnt_osname;
struct inode *root_inode = NULL;
uint64_t recordsize;
int error = 0;
zfsvfs_t *zfsvfs = NULL;
- vfs_t *vfs = NULL;
int canwrite;
int dataset_visible_zone;
- ASSERT(zm);
- ASSERT(osname);
-
dataset_visible_zone = zone_dataset_visible(osname, &canwrite);
/*
@@ -1511,10 +1337,6 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
return (SET_ERROR(EPERM));
}
- error = zfsvfs_parse_options(zm->mnt_data, &vfs);
- if (error)
- return (error);
-
/*
* If a non-writable filesystem is being mounted without the
* read-only flag, pretend it was set, as done for snapshots.
@@ -1523,16 +1345,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
vfs->vfs_readonly = B_TRUE;
error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
- if (error) {
- zfsvfs_vfs_free(vfs);
+ if (error)
goto out;
- }
if ((error = dsl_prop_get_integer(osname, "recordsize",
- &recordsize, NULL))) {
- zfsvfs_vfs_free(vfs);
+ &recordsize, NULL)))
goto out;
- }
vfs->vfs_data = zfsvfs;
zfsvfs->z_vfs = vfs;
@@ -1614,6 +1432,13 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
out:
if (error) {
if (zfsvfs != NULL) {
+ /*
+ * We're returning error, so the caller still owns
+ * the mount options vfs_t. Remove them from zfsvfs
+ * so we don't try to free them.
+ */
+ zfsvfs->z_vfs = NULL;
+
dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
zfsvfs_free(zfsvfs);
}
@@ -1704,24 +1529,16 @@ zfs_umount(struct super_block *sb)
}
int
-zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm)
+zfs_remount(struct super_block *sb, vfs_t *vfsp, int flags)
{
zfsvfs_t *zfsvfs = sb->s_fs_info;
- vfs_t *vfsp;
boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os);
- int error;
if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) &&
- !(*flags & SB_RDONLY)) {
- *flags |= SB_RDONLY;
+ !(flags & SB_RDONLY))
return (EROFS);
- }
- error = zfsvfs_parse_options(zm->mnt_data, &vfsp);
- if (error)
- return (error);
-
- if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY))
+ if (!zfs_is_readonly(zfsvfs) && (flags & SB_RDONLY))
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
zfs_unregister_callbacks(zfsvfs);
@@ -1732,7 +1549,7 @@ zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm)
if (!issnap)
(void) zfs_register_callbacks(vfsp);
- return (error);
+ return (0);
}
int
@@ -1963,15 +1780,6 @@ bail:
/* release the VFS ops */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
-
- if (err != 0) {
- /*
- * Since we couldn't setup the sa framework, try to force
- * unmount this file system.
- */
- if (zfsvfs->z_os)
- (void) zfs_umount(zfsvfs->z_sb);
- }
return (err);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index a788e3fd4862..e65f81230124 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -200,8 +200,9 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
* Keep a count of the synchronous opens in the znode. On first
* synchronous open we must convert all previous async transactions
* into sync to keep correct ordering.
+ * Skip it for snapshot, as it won't have any transactions.
*/
- if (flag & O_SYNC) {
+ if (!zfsvfs->z_issnap && (flag & O_SYNC)) {
if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
zil_async_to_sync(zfsvfs->z_log, zp->z_id);
}
@@ -222,7 +223,7 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
return (error);
/* Decrement the synchronous opens in the znode */
- if (flag & O_SYNC)
+ if (!zfsvfs->z_issnap && (flag & O_SYNC))
atomic_dec_32(&zp->z_sync_cnt);
zfs_exit(zfsvfs, FTAG);
@@ -2581,8 +2582,19 @@ top:
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- if (mask != 0)
+ if (mask != 0) {
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+ /*
+ * Ensure that the z_seq is always incremented on setattr
+ * operation. This is required for change accounting for
+ * NFS clients.
+ *
+ * ATTR_MODE already increments via zfs_acl_chmod_setattr.
+ * ATTR_SIZE already increments via zfs_freesp.
+ */
+ if (!(mask & (ATTR_MODE | ATTR_SIZE)))
+ zp->z_seq++;
+ }
mutex_exit(&zp->z_lock);
if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
@@ -3513,7 +3525,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
boolean_t is_tmpfile = 0;
uint64_t txg;
- is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+ is_tmpfile = (sip->i_nlink == 0 &&
+ (inode_state_read_once(sip) & I_LINKABLE));
ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
index 711da151f65e..0568bb63c75e 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
@@ -37,8 +37,8 @@ zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
{
fstrans_cookie_t cookie;
ushort_t empty_fid = 0;
- fid_t *fid;
- int len_bytes, rc;
+ fid_t *fid, *pfid;
+ int len_bytes, required_len, parent_len, rc, prc, fh_type;
len_bytes = *max_len * sizeof (__u32);
@@ -56,11 +56,44 @@ zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
else
rc = zfs_fid(ip, fid);
+ required_len = offsetof(fid_t, fid_data) + fid->fid_len;
+
+ /*
+ * Kernel has requested that the resulting file handle contain
+ * a reference to the provided parent. This typically would happen
+ * if the NFS export has subtree checking enabled.
+ */
+ if (parent != NULL) {
+ if ((rc == 0) && (len_bytes >
+ required_len + offsetof(fid_t, fid_data))) {
+ parent_len = len_bytes - required_len;
+ pfid = (fid_t *)((char *)fh + required_len);
+ pfid->fid_len = parent_len - offsetof(fid_t, fid_data);
+ } else {
+ empty_fid = 0;
+ pfid = (fid_t *)&empty_fid;
+ }
+
+ if (zfsctl_is_node(parent))
+ prc = zfsctl_fid(parent, pfid);
+ else
+ prc = zfs_fid(parent, pfid);
+
+ if (rc == 0 && prc != 0)
+ rc = prc;
+
+ required_len += offsetof(fid_t, fid_data) +
+ pfid->fid_len;
+ fh_type = FILEID_INO32_GEN_PARENT;
+ } else {
+ fh_type = FILEID_INO32_GEN;
+ }
+
spl_fstrans_unmark(cookie);
- len_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
- *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
- return (rc == 0 ? FILEID_INO32_GEN : 255);
+ *max_len = roundup(required_len, sizeof (__u32)) / sizeof (__u32);
+
+ return (rc == 0 ? fh_type : FILEID_INVALID);
}
static struct dentry *
@@ -74,7 +107,8 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
len_bytes = fh_len * sizeof (__u32);
- if (fh_type != FILEID_INO32_GEN ||
+ if ((fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT) ||
len_bytes < offsetof(fid_t, fid_data) ||
len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
return (ERR_PTR(-EINVAL));
@@ -104,6 +138,46 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
return (d_obtain_alias(ip));
}
+static struct dentry *
+zpl_fh_to_parent(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ /*
+ * Convert the provided struct fid to a dentry for the parent
+ * This is possible only if it was created with the parent,
+ * e.g. type is FILEID_INO32_GEN_PARENT. When this type of
+ * filehandle is created we simply pack the parent fid_t
+ * after the entry's fid_t. So this function will adjust
+ * offset in the provided buffer to the begining of the
+ * parent fid_t and call zpl_fh_to_dentry() on it.
+ */
+ fid_t *fid = (fid_t *)fh;
+ fid_t *pfid;
+ int len_bytes, parent_len_bytes, child_fid_bytes, parent_fh_len;
+
+ len_bytes = fh_len * sizeof (__u32);
+
+ if ((fh_type != FILEID_INO32_GEN_PARENT) ||
+ len_bytes < offsetof(fid_t, fid_data) ||
+ len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
+ return (ERR_PTR(-EINVAL));
+
+ child_fid_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
+ parent_len_bytes = len_bytes - child_fid_bytes;
+
+ if (parent_len_bytes < offsetof(fid_t, fid_data))
+ return (ERR_PTR(-EINVAL));
+
+ pfid = (fid_t *)((char *)fh + child_fid_bytes);
+
+ if (parent_len_bytes < offsetof(fid_t, fid_data) + pfid->fid_len)
+ return (ERR_PTR(-EINVAL));
+
+ parent_fh_len = parent_len_bytes / sizeof (__u32);
+ return (zpl_fh_to_dentry(sb, (struct fid *)pfid, parent_fh_len,
+ FILEID_INO32_GEN));
+}
+
/*
* In case the filesystem contains name longer than 255, we need to override
* the default get_name so we don't get buffer overflow. Unfortunately, since
@@ -177,6 +251,7 @@ zpl_commit_metadata(struct inode *inode)
const struct export_operations zpl_export_operations = {
.encode_fh = zpl_encode_fh,
.fh_to_dentry = zpl_fh_to_dentry,
+ .fh_to_parent = zpl_fh_to_parent,
.get_name = zpl_get_name,
.get_parent = zpl_get_parent,
.commit_metadata = zpl_commit_metadata,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index f7691c02d163..ffe227796f0a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -43,6 +43,9 @@
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif
+#ifdef HAVE_FILELOCK_HEADER
+#include <linux/filelock.h>
+#endif
/*
* When using fallocate(2) to preallocate space, inflate the requested
@@ -776,34 +779,23 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
- switch (advice) {
- case POSIX_FADV_SEQUENTIAL:
- case POSIX_FADV_WILLNEED:
+ if (advice == POSIX_FADV_WILLNEED) {
+ loff_t rlen = len ? len : i_size_read(ip) - offset;
+ dmu_prefetch(os, zp->z_id, 0, offset, rlen,
+ ZIO_PRIORITY_ASYNC_READ);
+ if (!zn_has_cached_data(zp, offset, offset + rlen - 1)) {
+ zfs_exit(zfsvfs, FTAG);
+ return (error);
+ }
+ }
+
#ifdef HAVE_GENERIC_FADVISE
- if (zn_has_cached_data(zp, offset, offset + len - 1))
- error = generic_fadvise(filp, offset, len, advice);
+ error = generic_fadvise(filp, offset, len, advice);
#endif
- /*
- * Pass on the caller's size directly, but note that
- * dmu_prefetch_max will effectively cap it. If there
- * really is a larger sequential access pattern, perhaps
- * dmu_zfetch will detect it.
- */
- if (len == 0)
- len = i_size_read(ip) - offset;
- dmu_prefetch(os, zp->z_id, 0, offset, len,
- ZIO_PRIORITY_ASYNC_READ);
- break;
- case POSIX_FADV_NORMAL:
- case POSIX_FADV_RANDOM:
- case POSIX_FADV_DONTNEED:
- case POSIX_FADV_NOREUSE:
- /* ignored for now */
- break;
- default:
- error = -EINVAL;
- break;
+ if (error == 0 && advice == POSIX_FADV_DONTNEED) {
+ loff_t rlen = len ? len : i_size_read(ip) - offset;
+ dmu_evict_range(os, zp->z_id, offset, rlen);
}
zfs_exit(zfsvfs, FTAG);
@@ -1242,6 +1234,7 @@ const struct file_operations zpl_file_operations = {
.mmap = zpl_mmap,
.fsync = zpl_fsync,
.fallocate = zpl_fallocate,
+ .setlease = generic_setlease,
.copy_file_range = zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
.clone_file_range = zpl_clone_file_range,
@@ -1264,6 +1257,7 @@ const struct file_operations zpl_dir_file_operations = {
.read = generic_read_dir,
.iterate_shared = zpl_iterate,
.fsync = zpl_fsync,
+ .setlease = generic_setlease,
.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = zpl_compat_ioctl,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
index f97662d052c7..e4e15c824f4b 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -506,6 +506,32 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
}
#endif
+#ifdef STATX_CHANGE_COOKIE
+ if (request_mask & STATX_CHANGE_COOKIE) {
+ /*
+ * knfsd uses the STATX_CHANGE_COOKIE to surface to clients
+ * change_info4 data, which is used to implement NFS client
+ * name caching (see RFC 8881 Section 10.8). This number
+ * should always increase with changes and should not be
+ * reused. We cannot simply present ctime here because
+ * ZFS uses a coarse timer to set them, which may cause
+ * clients to fail to detect changes and invalidate cache.
+ *
+ * ZFS always increments znode z_seq number, but this is
+ * uint_t and so we mask in ctime to upper bits.
+ *
+ * STATX_ATTR_CHANGE_MONOTONIC is advertised
+ * to prevent knfsd from generating the change cookie
+ * based on ctime. C.f. nfsd4_change_attribute in
+ * fs/nfsd/nfsfh.c.
+ */
+ stat->change_cookie =
+ ((u64)stat->ctime.tv_sec << 32) | zp->z_seq;
+ stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
+ stat->result_mask |= STATX_CHANGE_COOKIE;
+ }
+#endif
+
#ifdef STATX_DIOALIGN
if (request_mask & STATX_DIOALIGN) {
uint64_t align;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index 347b352506e5..2cd0f17c860f 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -24,6 +24,7 @@
* Copyright (c) 2023, Datto Inc. All rights reserved.
* Copyright (c) 2025, Klara, Inc.
* Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
+ * Copyright (c) 2026, TrueNAS.
*/
@@ -35,6 +36,8 @@
#include <linux/iversion.h>
#include <linux/version.h>
#include <linux/vfs_compat.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
/*
* What to do when the last reference to an inode is released. If 0, the kernel
@@ -265,21 +268,6 @@ zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
}
static int
-zpl_remount_fs(struct super_block *sb, int *flags, char *data)
-{
- zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
- fstrans_cookie_t cookie;
- int error;
-
- cookie = spl_fstrans_mark();
- error = -zfs_remount(sb, flags, &zm);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
int error;
@@ -354,21 +342,6 @@ zpl_show_options(struct seq_file *seq, struct dentry *root)
}
static int
-zpl_fill_super(struct super_block *sb, void *data, int silent)
-{
- zfs_mnt_t *zm = (zfs_mnt_t *)data;
- fstrans_cookie_t cookie;
- int error;
-
- cookie = spl_fstrans_mark();
- error = -zfs_domount(sb, zm, silent);
- spl_fstrans_unmark(cookie);
- ASSERT3S(error, <=, 0);
-
- return (error);
-}
-
-static int
zpl_test_super(struct super_block *s, void *data)
{
zfsvfs_t *zfsvfs = s->s_fs_info;
@@ -383,17 +356,477 @@ zpl_test_super(struct super_block *s, void *data)
return (zfsvfs != NULL && os == zfsvfs->z_os);
}
-static struct super_block *
-zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
+static void
+zpl_kill_sb(struct super_block *sb)
+{
+ zfs_preumount(sb);
+ kill_anon_super(sb);
+}
+
+void
+zpl_prune_sb(uint64_t nr_to_scan, void *arg)
+{
+ struct super_block *sb = (struct super_block *)arg;
+ int objects = 0;
+
+ /*
+ * Ensure the superblock is not in the process of being torn down.
+ */
+#ifdef HAVE_SB_DYING
+ if (down_read_trylock(&sb->s_umount)) {
+ if (!(sb->s_flags & SB_DYING) && sb->s_root &&
+ (sb->s_flags & SB_BORN)) {
+ (void) zfs_prune(sb, nr_to_scan, &objects);
+ }
+ up_read(&sb->s_umount);
+ }
+#else
+ if (down_read_trylock(&sb->s_umount)) {
+ if (!hlist_unhashed(&sb->s_instances) &&
+ sb->s_root && (sb->s_flags & SB_BORN)) {
+ (void) zfs_prune(sb, nr_to_scan, &objects);
+ }
+ up_read(&sb->s_umount);
+ }
+#endif
+}
+
+/*
+ * Mount option parsing.
+ *
+ * The kernel receives a set of "stringy" mount options, typically a
+ * comma-separated list through mount(2) or fsconfig(2). These are split into a
+ * set of struct fs_parameter, and then vfs_parse_fs_param() is called for
+ * each. That function will handle (and consume) some options directly, and
+ * other subsystems (mainly security modules) are given the opportunity to
+ * consume them too. Any left over are passed to zpl_parse_param(). Our job is
+ * to use them to fill in the vfs_t we've attached previously to
+ * fc->fs_private, ready for the mount or remount call when it comes.
+ *
+ * Historically, mount options have been generated, removed, modified and
+ * otherwise complicated by multiple different actors over a long time: the
+ * kernel itself, the original mount(8) utility and later libmount,
+ * mount.zfs(8), libzfs and the ZFS tools that use it, and any program using
+ * the various mount APIs that have come and gone over the years. This is
+ * further complicated by cross-pollination between OpenSolaris/illumos, Linux
+ * and FreeBSD. Long story short: we could see all sorts of things, and we need
+ * to at least try not to break old userspace programs.
+ *
+ * At time of writing, this is my best understanding of all the options we
+ * might reasonably see, and where and how they're handled.
+ *
+ *
+ * These are common options for all filesystems that are processed by the
+ * kernel directly, without zpl_parse_param() being called. They're a bit of a
+ * mixed bag, but are ultimately all available to us via either sb->s_flags or
+ * fc->sb_flags:
+ *
+ * dirsync: set SB_DIRSYNC
+ * lazytime: set SB_LAZYTIME
+ * mand: set SB_MANDLOCK
+ * ro: set SB_RDONLY
+ * sync: set SB_SYNCHRONOUS
+ *
+ * async: clear SB_SYNCHRONOUS
+ * nolazytime: clear SB_LAZYTIME
+ * nomand: clear SB_MANDLOCK
+ * rw: clear SB_RDONLY
+ *
+ * Fortunately, almost all of these are handled directly by the kernel. 'mand'
+ * and 'nomand' are swallowed by the kernel ('mand' emits a warning in the
+ * kernel log), but it and the corresponding dataset property have been a no-op
+ * in OpenZFS for years, so there's nothing for us to do there.
+ *
+ * The only tricky one is SB_RDONLY ('ro'/'rw'), which can be both a mount and
+ * a superblock option. While we won't receive the "stringy" options, the
+ * kernel will set it for us in fc->sb_flags, and we've always had special
+ * handling for it at mount and remount time (eg handling snapshot mounts), so
+ * it's not a problem to do nothing here because we will sort it out later.
+ *
+ *
+ * These are options that we may receive as "stringy" options but also as mount
+ * flags.
+ *
+ * exec: clear MS_NOEXEC
+ * noexec: set MS_NOEXEC
+ * suid: clear MS_NOSUID
+ * nosuid: set MS_NOSUID
+ * dev: clear MS_NODEV
+ * nodev: set MS_NODEV
+ * atime: clear MS_NOATIME
+ * noatime: set MS_NOATIME
+ * relatime: set MS_RELATIME
+ * norelatime: clear MS_RELATIME
+ *
+ * In testing, it appears that recent libmount will convert them, but our own
+ * mount code (libzfs_mount) may not. We will be called for the stringy
+ * versions, but not for the flags. The flags will later be available on
+ * vfsmount->mnt_flags, not set on the vfs_t. This tends not to matter in
+ * practice, as almost all mounts come through libzfs (via zfs-mount(8) or
+ * mount.zfs(8)) and so as strings, and when they do come through flags, they
+ * will still be reported correctly via mountinfo and by zfs-get(8), which has
+ * special handling for "temporary" properties. Also, we never use these
+ * internally for any decisions; 'exec', 'suid' and 'dev' are handled in the
+ * kernel, and the kernel provides helpers for 'atime' and 'relatime'. The
+ * only place the difference is observable is through zfs_get_temporary_prop(),
+ * which is only used by the zfs.get_prop() Lua call.
+ *
+ * This is fixable by getting at vfsmount->mnt_flags, but this is not readily
+ * available until after the mount operation is completed, and with some
+ * effort. This is all very low impact, so it's left for future improvement.
+ *
+ *
+ * These are true OpenZFS-specific mount options. They give the equivalent
+ * of temporarily setting the pool properties as follows:
+ *
+ * strictatime: atime=on, relatime=off
+ *
+ * xattr: xattr=sa
+ * saxattr: xattr=sa
+ * dirxattr: xattr=dir
+ * noxattr: xattr=off
+ *
+ *
+ * mntpoint= provides the canonical mount point for a snapshot mount. This
+ * is an assist for the snapshot automounter call out to userspace, to
+ * understand where the snapshot is mounted even when triggered from an
+ * alternate mount namespace (eg inside a chroot).
+ *
+ * mntpoint= vfs->vfs_mntpoint=...
+ *
+ *
+ * These are used for coordination inside libzfs, and should not make it
+ * to the kernel, but it does not strip them, so we handle them and ignore
+ * them.
+ *
+ * defaults
+ * zfsutil
+ * remount
+ *
+ *
+ * These are specific to SELinux. When that security module is running, it
+ * will consume them, but if not, they will be passed through to us. libzfs
+ * adds them unconditionally, so we will always see them when SELinux is not
+ * running, and ignore them.
+ *
+ * fscontext
+ * defcontext
+ * rootcontext
+ * context
+ *
+ *
+ * When preparing a remount, libmount will read /proc/self/mountinfo and add
+ * any unrecognised flags it finds there to the options. So, we have to accept
+ * anything that __zpl_show_options() can produce.
+ *
+ * posixacl
+ * noacl
+ * casesensitive
+ * caseinsensitive
+ * casemixed
+ *
+ *
+ * mount(8) has a notion of "sloppy" options. According to the documentation,
+ * when the -s switch is provided, unrecognised mount options will be ignored.
+ * Only the Linux NFS and SMB filesystems support it, and traditionally
+ * OpenZFS has too. However, it appears massively underspecified and
+ * inconsistent. Depending on the interplay between mount(8), the mount helper
+ * (eg mount.zfs(8)) and libmount, -s may cause unknown options to be filtered
+ * in userspace, _or_ an additional option 'sloppy' to be passed to the kernel
+ * either before or after the "unknown" option, _or_ nothing at all happens
+ * and the unknown option to be passed through to the kernel as-is. The
+ * kernel NFS and SMB filesystems both expect to see an explicit option
+ * 'sloppy' and use this to either ignore or reject unknown options, but as
+ * described, it's very easy for that option to not appear, or appear too late.
+ *
+ * OpenZFS has a test for this in the test suite, and it's documented in
+ * mount.zfs(8), so to support it we accept 'sloppy' and ignore it, and all
+ * other unknown options produce a notice in the kernel log, and are also
+ * ignored. This allows the "feature" to continue to work, while avoiding
+ * the additional housekeeping for the 'sloppy' option.
+ *
+ * sloppy
+ *
+ *
+ * Finally, all filesystems get automatic handling for the 'source' option,
+ * that is, the "name" of the filesystem (the first column of df(1)'s output).
+ * However, this only happens if the handler does not otherwise handle
+ * the 'source' option. Since we handle _all_ options because of 'sloppy', we
+ * deal with this explicitly by calling into the kernel's helper for this,
+ * vfs_parse_fs_param_source(), which sets up fc->source.
+ *
+ * source
+ *
+ *
+ * Thank you for reading this far. I hope you find what you are looking for,
+ * in this life or the next.
+ *
+ * -- robn, 2026-03-26
+ */
+
+enum {
+ Opt_exec, Opt_suid, Opt_dev,
+ Opt_atime, Opt_relatime, Opt_strictatime,
+ Opt_saxattr, Opt_dirxattr, Opt_noxattr,
+ Opt_mntpoint,
+
+ Opt_ignore, Opt_warn,
+};
+
+static const struct fs_parameter_spec zpl_param_spec[] = {
+ fsparam_flag_no("exec", Opt_exec),
+ fsparam_flag_no("suid", Opt_suid),
+ fsparam_flag_no("dev", Opt_dev),
+
+ fsparam_flag_no("atime", Opt_atime),
+ fsparam_flag_no("relatime", Opt_relatime),
+ fsparam_flag("strictatime", Opt_strictatime),
+
+ fsparam_flag("xattr", Opt_saxattr),
+ fsparam_flag("saxattr", Opt_saxattr),
+ fsparam_flag("dirxattr", Opt_dirxattr),
+ fsparam_flag("noxattr", Opt_noxattr),
+
+ fsparam_string("mntpoint", Opt_mntpoint),
+
+ fsparam_flag("defaults", Opt_ignore),
+ fsparam_flag("zfsutil", Opt_ignore),
+ fsparam_flag("remount", Opt_ignore),
+
+ fsparam_string("fscontext", Opt_ignore),
+ fsparam_string("defcontext", Opt_ignore),
+ fsparam_string("rootcontext", Opt_ignore),
+ fsparam_string("context", Opt_ignore),
+
+ fsparam_flag("posixacl", Opt_ignore),
+ fsparam_flag("noacl", Opt_ignore),
+ fsparam_flag("casesensitive", Opt_ignore),
+ fsparam_flag("caseinsensitive", Opt_ignore),
+ fsparam_flag("casemixed", Opt_ignore),
+
+ fsparam_flag("sloppy", Opt_ignore),
+
+ {}
+};
+
+static int
+zpl_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct super_block *s;
+ vfs_t *vfs = fc->fs_private;
+
+ /* Handle 'source' explicitly so we don't trip on it as an unknown. */
+ int opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return (opt);
+
+ struct fs_parse_result result;
+ opt = fs_parse(fc, zpl_param_spec, param, &result);
+ if (opt == -ENOPARAM) {
+ /*
+ * Convert unknowns to warnings, to work around the whole
+ * "sloppy option" mess.
+ */
+ opt = Opt_warn;
+ }
+ if (opt < 0)
+ return (opt);
+
+ switch (opt) {
+ case Opt_exec:
+ vfs->vfs_exec = !result.negated;
+ vfs->vfs_do_exec = B_TRUE;
+ break;
+ case Opt_suid:
+ vfs->vfs_setuid = !result.negated;
+ vfs->vfs_do_setuid = B_TRUE;
+ break;
+ case Opt_dev:
+ vfs->vfs_devices = !result.negated;
+ vfs->vfs_do_devices = B_TRUE;
+ break;
+
+ case Opt_atime:
+ vfs->vfs_atime = !result.negated;
+ vfs->vfs_do_atime = B_TRUE;
+ break;
+ case Opt_relatime:
+ vfs->vfs_relatime = !result.negated;
+ vfs->vfs_do_relatime = B_TRUE;
+ break;
+ case Opt_strictatime:
+ vfs->vfs_atime = B_TRUE;
+ vfs->vfs_do_atime = B_TRUE;
+ vfs->vfs_relatime = B_FALSE;
+ vfs->vfs_do_relatime = B_TRUE;
+ break;
+
+ case Opt_saxattr:
+ vfs->vfs_xattr = ZFS_XATTR_SA;
+ vfs->vfs_do_xattr = B_TRUE;
+ break;
+ case Opt_dirxattr:
+ vfs->vfs_xattr = ZFS_XATTR_DIR;
+ vfs->vfs_do_xattr = B_TRUE;
+ break;
+ case Opt_noxattr:
+ vfs->vfs_xattr = ZFS_XATTR_OFF;
+ vfs->vfs_do_xattr = B_TRUE;
+ break;
+
+ case Opt_mntpoint:
+ if (vfs->vfs_mntpoint != NULL)
+ kmem_strfree(vfs->vfs_mntpoint);
+ vfs->vfs_mntpoint = kmem_strdup(param->string);
+ break;
+
+ case Opt_ignore:
+ break;
+
+ case Opt_warn:
+ cmn_err(CE_NOTE,
+ "ZFS: ignoring unknown mount option: %s", param->key);
+ break;
+
+ default:
+ return (-SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
+
+/*
+ * Before Linux 5.8, the kernel's individual parameter parsing had a list of
+ * "forbidden" options that would always be rejected early. These were options
+ * that should be specified by MS_* flags, to be set on the superblock
+ * directly. However, it was inconsistently applied (eg it had various "*atime"
+ * options but not "atime"), and also caused problems when it was not in sync
+ * with the version of libmount in use. It was deemed needlessly restrictive
+ * and was dropped in torvalds/linux@9193ae87a8af.
+ *
+ * Unfortunately, some of the options on this list are used by OpenZFS, so
+ * we need to see them. These include the aforementioned "*atime", "dev",
+ * "exec" and "suid".
+ *
+ * There is no easy compile-time check available to detect this, so we use
+ * a simple version check that should make it available everywhere needed,
+ * most notably RHEL8's 4.18+extras, which has backported fs_context support
+ * but does not include the 5.8 commit.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0)
+#define HAVE_FORBIDDEN_SB_FLAGS 1
+#endif
+
+#ifdef HAVE_FORBIDDEN_SB_FLAGS
+/*
+ * The typical path for options parsing through mount(2) is:
+ *
+ * ksys_mount
+ * do_mount
+ * generic_parse_monolithic
+ * vfs_parse_fs_string
+ * vfs_parse_fs_param
+ * zpl_parse_param
+ *
+ * vfs_parse_fs_param() calls the internal vfs_parse_sb_flag(), which is
+ * where the "forbidden" flags are applied. If it makes it through there,
+ * it will later call fc->parse_param() ie zpl_parse_param(). We can't
+ * intercept this chain in the middle anywhere; the earliest thing we can
+ * override is generic_parse_monolithic(), substituting our own by setting
+ * fc->parse_monolithic and doing the parsing work ourselves.
+ *
+ * Fortunately, generic_parse_monolithic() is almost entirely splitting the
+ * incoming parameter string on comma and handing off to the rest of the
+ * pipeline. This is easily replaced (almost entirely by reviving a few bits
+ * of our old options parser).
+ *
+ * To keep the change as narrow as possible, we reuse zpl_param_spec and
+ * zpl_parse_param() as much as possible. Once we've parsed the option, we call
+ * fs_parse(zpl_param_spec) to find out if the option is actually one we
+ * explicitly care about. If it is, we call zpl_parse_param() directly,
+ * avoiding vfs_parse_fs_param() and so the risk of being rejected. If it is
+ * not one we explicitly care about, we call vfs_parse_fs_param() as normal,
+ * letting the kernel reject it if it wishes. If it doesn't, it will end up
+ * back in zpl_parse_param() via fc->parse_param, and we can ignore or warn
+ * about it as we normally would.
+ */
+static int
+zpl_parse_monolithic(struct fs_context *fc, void *data)
+{
+ char *mntopts = data;
+
+ if (mntopts == NULL)
+ return (0);
+
+ /*
+ * Because we supply a .parse_monolithic callback, the kernel does
+ * no consideration of the options blob at all. Because of this, we
+ * have to give LSMs a first look at it. They will remove any options
+ * of interest to them (eg the SELinux *context= options).
+ */
+ int err = security_sb_eat_lsm_opts(mntopts, &fc->security);
+ if (err)
+ return (err);
+
+ char *key;
+ while ((key = strsep(&mntopts, ",")) != NULL) {
+ if (!*key)
+ continue;
+
+ struct fs_parameter param = {
+ .key = key,
+ };
+
+ char *value = strchr(key, '=');
+ if (value != NULL) {
+ /* Key starts with '='. Kernel ignores, we will too. */
+ if (value == key)
+ continue;
+ *value++ = '\0';
+
+ /* key=value is a "string" type, set up for that */
+ param.string = value;
+ param.type = fs_value_is_string;
+ param.size = strlen(value);
+ } else {
+ /* unadorned key is a "flag" type */
+ param.type = fs_value_is_flag;
+ }
+
+ /* Check if this is one of our options. */
+ struct fs_parse_result result;
+ int opt = fs_parse(fc, zpl_param_spec, &param, &result);
+ if (opt >= 0) {
+ /*
+ * We already know this is one of our options, so a
+ * failure here would be nonsensical.
+ */
+ VERIFY0(zpl_parse_param(fc, &param));
+ } else {
+ /*
+ * Not one of our options, send it through the kernel's
+ * standard parameter handling.
+ */
+ err = vfs_parse_fs_param(fc, &param);
+ if (err < 0)
+ return (err);
+ }
+ }
+
+ return (0);
+}
+#endif /* HAVE_FORBIDDEN_SB_FLAGS */
+
+static int
+zpl_get_tree(struct fs_context *fc)
+{
+ struct super_block *sb;
objset_t *os;
boolean_t issnap = B_FALSE;
int err;
- err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
+ err = dmu_objset_hold(fc->source, FTAG, &os);
if (err)
- return (ERR_PTR(-err));
+ return (-err);
/*
* The dsl pool lock must be released prior to calling sget().
@@ -405,7 +838,8 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
dsl_pool_rele(dmu_objset_pool(os), FTAG);
- s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+ sb = sget(fc->fs_type, zpl_test_super, set_anon_super,
+ fc->sb_flags, os);
/*
* Recheck with the lock held to prevent mounting the wrong dataset
@@ -415,93 +849,161 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
* also s_umount lock is not held there so it would race with
* zfs_umount and zfsvfs can be freed.
*/
- if (!IS_ERR(s) && s->s_fs_info != NULL) {
- zfsvfs_t *zfsvfs = s->s_fs_info;
+ if (!IS_ERR(sb) && sb->s_fs_info != NULL) {
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
if (zpl_enter(zfsvfs, FTAG) == 0) {
if (os != zfsvfs->z_os)
- err = -SET_ERROR(EBUSY);
+ err = SET_ERROR(EBUSY);
issnap = zfsvfs->z_issnap;
zpl_exit(zfsvfs, FTAG);
} else {
- err = -SET_ERROR(EBUSY);
+ err = SET_ERROR(EBUSY);
}
}
dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
dsl_dataset_rele(dmu_objset_ds(os), FTAG);
- if (IS_ERR(s))
- return (ERR_CAST(s));
+ if (IS_ERR(sb))
+ return (PTR_ERR(sb));
if (err) {
- deactivate_locked_super(s);
- return (ERR_PTR(err));
+ deactivate_locked_super(sb);
+ return (-err);
}
- if (s->s_root == NULL) {
- err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
+ if (sb->s_root == NULL) {
+ vfs_t *vfs = fc->fs_private;
+
+ /* Apply readonly flag as mount option */
+ if (fc->sb_flags & SB_RDONLY) {
+ vfs->vfs_readonly = B_TRUE;
+ vfs->vfs_do_readonly = B_TRUE;
+ }
+
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ err = zfs_domount(sb, fc->source, vfs,
+ fc->sb_flags & SB_SILENT ? 1 : 0);
+ spl_fstrans_unmark(cookie);
+
if (err) {
- deactivate_locked_super(s);
- return (ERR_PTR(err));
+ deactivate_locked_super(sb);
+ return (-err);
}
- s->s_flags |= SB_ACTIVE;
- } else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
+
+ /*
+ * zfsvfs has taken ownership of the mount options, so we
+ * need to ensure we don't free them.
+ */
+ fc->fs_private = NULL;
+
+ sb->s_flags |= SB_ACTIVE;
+ } else if (!issnap && ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)) {
/*
* Skip ro check for snap since snap is always ro regardless
* ro flag is passed by mount or not.
*/
- deactivate_locked_super(s);
- return (ERR_PTR(-EBUSY));
+ deactivate_locked_super(sb);
+ return (-SET_ERROR(EBUSY));
}
- return (s);
+ struct dentry *root = dget(sb->s_root);
+ if (IS_ERR(root))
+ return (PTR_ERR(root));
+
+ fc->root = root;
+ return (0);
}
-static struct dentry *
-zpl_mount(struct file_system_type *fs_type, int flags,
- const char *osname, void *data)
+static int
+zpl_reconfigure(struct fs_context *fc)
{
- zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
+ fstrans_cookie_t cookie;
+ int error;
- struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
- if (IS_ERR(sb))
- return (ERR_CAST(sb));
+ cookie = spl_fstrans_mark();
+ error = -zfs_remount(fc->root->d_sb, fc->fs_private, fc->sb_flags);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
- return (dget(sb->s_root));
-}
+ if (error == 0) {
+ /*
+ * zfsvfs has taken ownership of the mount options, so we
+ * need to ensure we don't free them.
+ */
+ fc->fs_private = NULL;
+ }
-static void
-zpl_kill_sb(struct super_block *sb)
-{
- zfs_preumount(sb);
- kill_anon_super(sb);
+ return (error);
}
-void
-zpl_prune_sb(uint64_t nr_to_scan, void *arg)
+static int
+zpl_dup_fc(struct fs_context *fc, struct fs_context *src_fc)
{
- struct super_block *sb = (struct super_block *)arg;
- int objects = 0;
+ vfs_t *src_vfs = src_fc->fs_private;
+ if (src_vfs == NULL)
+ return (0);
+
+ vfs_t *vfs = zfsvfs_vfs_alloc();
+ if (vfs == NULL)
+ return (-SET_ERROR(ENOMEM));
/*
- * Ensure the superblock is not in the process of being torn down.
+ * This is annoying, but a straight memcpy() would require us to
+ * reinitialise the lock.
*/
-#ifdef HAVE_SB_DYING
- if (down_read_trylock(&sb->s_umount)) {
- if (!(sb->s_flags & SB_DYING) && sb->s_root &&
- (sb->s_flags & SB_BORN)) {
- (void) zfs_prune(sb, nr_to_scan, &objects);
- }
- up_read(&sb->s_umount);
- }
-#else
- if (down_read_trylock(&sb->s_umount)) {
- if (!hlist_unhashed(&sb->s_instances) &&
- sb->s_root && (sb->s_flags & SB_BORN)) {
- (void) zfs_prune(sb, nr_to_scan, &objects);
- }
- up_read(&sb->s_umount);
- }
+ vfs->vfs_xattr = src_vfs->vfs_xattr;
+ vfs->vfs_readonly = src_vfs->vfs_readonly;
+ vfs->vfs_do_readonly = src_vfs->vfs_do_readonly;
+ vfs->vfs_setuid = src_vfs->vfs_setuid;
+ vfs->vfs_do_setuid = src_vfs->vfs_do_setuid;
+ vfs->vfs_exec = src_vfs->vfs_exec;
+ vfs->vfs_do_exec = src_vfs->vfs_do_exec;
+ vfs->vfs_devices = src_vfs->vfs_devices;
+ vfs->vfs_do_devices = src_vfs->vfs_do_devices;
+ vfs->vfs_do_xattr = src_vfs->vfs_do_xattr;
+ vfs->vfs_atime = src_vfs->vfs_atime;
+ vfs->vfs_do_atime = src_vfs->vfs_do_atime;
+ vfs->vfs_relatime = src_vfs->vfs_relatime;
+ vfs->vfs_do_relatime = src_vfs->vfs_do_relatime;
+ vfs->vfs_nbmand = src_vfs->vfs_nbmand;
+ vfs->vfs_do_nbmand = src_vfs->vfs_do_nbmand;
+
+ mutex_enter(&src_vfs->vfs_mntpt_lock);
+ if (src_vfs->vfs_mntpoint != NULL)
+ vfs->vfs_mntpoint = kmem_strdup(src_vfs->vfs_mntpoint);
+ mutex_exit(&src_vfs->vfs_mntpt_lock);
+
+ fc->fs_private = vfs;
+ return (0);
+}
+
+static void
+zpl_free_fc(struct fs_context *fc)
+{
+ zfsvfs_vfs_free(fc->fs_private);
+}
+
+const struct fs_context_operations zpl_fs_context_operations = {
+#ifdef HAVE_FORBIDDEN_SB_FLAGS
+ .parse_monolithic = zpl_parse_monolithic,
#endif
+ .parse_param = zpl_parse_param,
+ .get_tree = zpl_get_tree,
+ .reconfigure = zpl_reconfigure,
+ .dup = zpl_dup_fc,
+ .free = zpl_free_fc,
+};
+
+static int
+zpl_init_fs_context(struct fs_context *fc)
+{
+ fc->fs_private = zfsvfs_vfs_alloc();
+ if (fc->fs_private == NULL)
+ return (-SET_ERROR(ENOMEM));
+
+ fc->ops = &zpl_fs_context_operations;
+
+ return (0);
}
const struct super_operations zpl_super_operations = {
@@ -517,7 +1019,6 @@ const struct super_operations zpl_super_operations = {
.put_super = zpl_put_super,
.sync_fs = zpl_sync_fs,
.statfs = zpl_statfs,
- .remount_fs = zpl_remount_fs,
.show_devname = zpl_show_devname,
.show_options = zpl_show_options,
.show_stats = NULL,
@@ -560,7 +1061,7 @@ struct file_system_type zpl_fs_type = {
#else
.fs_flags = FS_USERNS_MOUNT,
#endif
- .mount = zpl_mount,
+ .init_fs_context = zpl_init_fs_context,
.kill_sb = zpl_kill_sb,
};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 89f9bc555fcf..dc47ff20fd74 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -1796,7 +1796,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
- ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));