diff options
Diffstat (limited to 'sys/contrib/openzfs/module/os/linux')
20 files changed, 1554 insertions, 704 deletions
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c deleted file mode 100644 index b6d967108fed..000000000000 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <behlendorf1@llnl.gov>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Atomic Implementation. 
- */ - -#include <sys/atomic.h> - -#ifdef ATOMIC_SPINLOCK -/* Global atomic lock declarations */ -DEFINE_SPINLOCK(atomic32_lock); -DEFINE_SPINLOCK(atomic64_lock); - -EXPORT_SYMBOL(atomic32_lock); -EXPORT_SYMBOL(atomic64_lock); -#endif /* ATOMIC_SPINLOCK */ diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index 89ca4a648b2f..585ad7377b49 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -197,266 +197,8 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) return (0); } - - EXPORT_SYMBOL(random_get_pseudo_bytes); -#if BITS_PER_LONG == 32 - -/* - * Support 64/64 => 64 division on a 32-bit platform. While the kernel - * provides a div64_u64() function for this we do not use it because the - * implementation is flawed. There are cases which return incorrect - * results as late as linux-2.6.35. Until this is fixed upstream the - * spl must provide its own implementation. - * - * This implementation is a slightly modified version of the algorithm - * proposed by the book 'Hacker's Delight'. The original source can be - * found here and is available for use without restriction. - * - * http://www.hackersdelight.org/HDcode/newCode/divDouble.c - */ - -/* - * Calculate number of leading of zeros for a 64-bit value. - */ -static int -nlz64(uint64_t x) -{ - register int n = 0; - - if (x == 0) - return (64); - - if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } - if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } - if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } - if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } - if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } - if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } - - return (n); -} - -/* - * Newer kernels have a div_u64() function but we define our own - * to simplify portability between kernel versions. 
- */ -static inline uint64_t -__div_u64(uint64_t u, uint32_t v) -{ - (void) do_div(u, v); - return (u); -} - -/* - * Turn off missing prototypes warning for these functions. They are - * replacements for libgcc-provided functions and will never be called - * directly. - */ -#if defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmissing-prototypes" -#endif - -/* - * Implementation of 64-bit unsigned division for 32-bit machines. - * - * First the procedure takes care of the case in which the divisor is a - * 32-bit quantity. There are two subcases: (1) If the left half of the - * dividend is less than the divisor, one execution of do_div() is all that - * is required (overflow is not possible). (2) Otherwise it does two - * divisions, using the grade school method. - */ -uint64_t -__udivdi3(uint64_t u, uint64_t v) -{ - uint64_t u0, u1, v1, q0, q1, k; - int n; - - if (v >> 32 == 0) { // If v < 2**32: - if (u >> 32 < v) { // If u/v cannot overflow, - return (__div_u64(u, v)); // just do one division. - } else { // If u/v would overflow: - u1 = u >> 32; // Break u into two halves. - u0 = u & 0xFFFFFFFF; - q1 = __div_u64(u1, v); // First quotient digit. - k = u1 - q1 * v; // First remainder, < v. - u0 += (k << 32); - q0 = __div_u64(u0, v); // Seconds quotient digit. - return ((q1 << 32) + q0); - } - } else { // If v >= 2**32: - n = nlz64(v); // 0 <= n <= 31. - v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. - u1 = u >> 1; // To ensure no overflow. - q1 = __div_u64(u1, v1); // Get quotient from - q0 = (q1 << n) >> 31; // Undo normalization and - // division of u by 2. - if (q0 != 0) // Make q0 correct or - q0 = q0 - 1; // too small by 1. - if ((u - q0 * v) >= v) - q0 = q0 + 1; // Now q0 is correct. 
- - return (q0); - } -} -EXPORT_SYMBOL(__udivdi3); - -#ifndef abs64 -/* CSTYLED */ -#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) -#endif - -/* - * Implementation of 64-bit signed division for 32-bit machines. - */ -int64_t -__divdi3(int64_t u, int64_t v) -{ - int64_t q, t; - q = __udivdi3(abs64(u), abs64(v)); - t = (u ^ v) >> 63; // If u, v have different - return ((q ^ t) - t); // signs, negate q. -} -EXPORT_SYMBOL(__divdi3); - -/* - * Implementation of 64-bit unsigned modulo for 32-bit machines. - */ -uint64_t -__umoddi3(uint64_t dividend, uint64_t divisor) -{ - return (dividend - (divisor * __udivdi3(dividend, divisor))); -} -EXPORT_SYMBOL(__umoddi3); - -/* 64-bit signed modulo for 32-bit machines. */ -int64_t -__moddi3(int64_t n, int64_t d) -{ - int64_t q; - boolean_t nn = B_FALSE; - - if (n < 0) { - nn = B_TRUE; - n = -n; - } - if (d < 0) - d = -d; - - q = __umoddi3(n, d); - - return (nn ? -q : q); -} -EXPORT_SYMBOL(__moddi3); - -/* - * Implementation of 64-bit unsigned division/modulo for 32-bit machines. - */ -uint64_t -__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) -{ - uint64_t q = __udivdi3(n, d); - if (r) - *r = n - d * q; - return (q); -} -EXPORT_SYMBOL(__udivmoddi4); - -/* - * Implementation of 64-bit signed division/modulo for 32-bit machines. - */ -int64_t -__divmoddi4(int64_t n, int64_t d, int64_t *r) -{ - int64_t q, rr; - boolean_t nn = B_FALSE; - boolean_t nd = B_FALSE; - if (n < 0) { - nn = B_TRUE; - n = -n; - } - if (d < 0) { - nd = B_TRUE; - d = -d; - } - - q = __udivmoddi4(n, d, (uint64_t *)&rr); - - if (nn != nd) - q = -q; - if (nn) - rr = -rr; - if (r) - *r = rr; - return (q); -} -EXPORT_SYMBOL(__divmoddi4); - -#if defined(__arm) || defined(__arm__) -/* - * Implementation of 64-bit (un)signed division for 32-bit arm machines. - * - * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) - * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, - * and the remainder in {r2, r3}. 
The return type is specifically left - * set to 'void' to ensure the compiler does not overwrite these registers - * during the return. All results are in registers as per ABI - */ -void -__aeabi_uldivmod(uint64_t u, uint64_t v) -{ - uint64_t res; - uint64_t mod; - - res = __udivdi3(u, v); - mod = __umoddi3(u, v); - { - register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); - register uint32_t r1 asm("r1") = (res >> 32); - register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); - register uint32_t r3 asm("r3") = (mod >> 32); - - asm volatile("" - : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */ - : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ - - return; /* r0; */ - } -} -EXPORT_SYMBOL(__aeabi_uldivmod); - -void -__aeabi_ldivmod(int64_t u, int64_t v) -{ - int64_t res; - uint64_t mod; - - res = __divdi3(u, v); - mod = __umoddi3(u, v); - { - register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); - register uint32_t r1 asm("r1") = (res >> 32); - register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); - register uint32_t r3 asm("r3") = (mod >> 32); - - asm volatile("" - : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */ - : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ - - return; /* r0; */ - } -} -EXPORT_SYMBOL(__aeabi_ldivmod); -#endif /* __arm || __arm__ */ - -#if defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - -#endif /* BITS_PER_LONG */ - /* * NOTE: The strtoxx behavior is solely based on my reading of the Solaris * ddi_strtol(9F) man page. 
I have not verified the behavior of these diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c index 5594b2f80c02..6d496e68511e 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -139,12 +139,10 @@ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { - gfp_t lflags = kmem_flags_convert(flags); + gfp_t lflags = kmem_flags_convert(flags | KM_VMEM); void *ptr; - if (skc->skc_flags & KMC_RECLAIMABLE) - lflags |= __GFP_RECLAIMABLE; - ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); + ptr = spl_vmalloc(size, lflags); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); @@ -424,7 +422,7 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) if (!empty) return (-EEXIST); - if (skc->skc_flags & KMC_RECLAIMABLE) + if (skc->skc_flags & KMC_RECLAIMABLE && !(flags & KM_VMEM)) lflags |= __GFP_RECLAIMABLE; ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c index 9fe008cef868..9fe4042b5079 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c @@ -188,7 +188,7 @@ spl_kvmalloc(size_t size, gfp_t lflags) return (ptr); } - return (spl_vmalloc(size, lflags | __GFP_HIGHMEM)); + return (spl_vmalloc(size, lflags)); } /* @@ -237,7 +237,7 @@ spl_kmem_alloc_impl(size_t size, int flags, int node) */ if (size > spl_kmem_alloc_max) { if (flags & KM_VMEM) { - ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); + ptr = spl_vmalloc(size, lflags); } else { return (NULL); } diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c index 02c5b42bc4a0..154ab12e84f7 100644 
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c @@ -531,7 +531,6 @@ kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, strlcpy(kpep->kpe_module, module, sizeof (kpep->kpe_module)); strlcpy(kpep->kpe_name, name, sizeof (kpep->kpe_name)); } -EXPORT_SYMBOL(kstat_proc_entry_init); kstat_t * __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, @@ -702,7 +701,6 @@ out: mutex_exit(&kstat_module_lock); } -EXPORT_SYMBOL(kstat_proc_entry_install); void __kstat_install(kstat_t *ksp) @@ -739,7 +737,6 @@ kstat_proc_entry_delete(kstat_proc_entry_t *kpep) mutex_exit(&kstat_module_lock); } -EXPORT_SYMBOL(kstat_proc_entry_delete); void __kstat_delete(kstat_t *ksp) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-math-compat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-math-compat.c new file mode 100644 index 000000000000..3184db7f28b0 --- /dev/null +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-math-compat.c @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <behlendorf1@llnl.gov>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Generic Implementation. + */ + +#include <sys/isa_defs.h> +#include <sys/sysmacros.h> + +/* + * 64-bit math support for 32-bit platforms. Compilers will generate + * references to the functions here if required. + */ + +#if BITS_PER_LONG == 32 + +/* + * Support 64/64 => 64 division on a 32-bit platform. While the kernel + * provides a div64_u64() function for this we do not use it because the + * implementation is flawed. There are cases which return incorrect + * results as late as linux-2.6.35. Until this is fixed upstream the + * spl must provide its own implementation. + * + * This implementation is a slightly modified version of the algorithm + * proposed by the book 'Hacker's Delight'. The original source can be + * found here and is available for use without restriction. + * + * http://www.hackersdelight.org/HDcode/newCode/divDouble.c + */ + +/* + * Calculate number of leading zeros for a 64-bit value. + */ +static int +nlz64(uint64_t x) +{ + register int n = 0; + + if (x == 0) + return (64); + + if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } + if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } + if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } + if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } + if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } + if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } + + return (n); +} + +/* + * Newer kernels have a div_u64() function but we define our own + * to simplify portability between kernel versions. + */ +static inline uint64_t +__div_u64(uint64_t u, uint32_t v) +{ + (void) do_div(u, v); + return (u); +} + +/* + * Implementation of 64-bit unsigned division for 32-bit machines. + * + * First the procedure takes care of the case in which the divisor is a + * 32-bit quantity. 
There are two subcases: (1) If the left half of the + * dividend is less than the divisor, one execution of do_div() is all that + * is required (overflow is not possible). (2) Otherwise it does two + * divisions, using the grade school method. + */ +uint64_t +__udivdi3(uint64_t u, uint64_t v) +{ + uint64_t u0, u1, v1, q0, q1, k; + int n; + + if (v >> 32 == 0) { // If v < 2**32: + if (u >> 32 < v) { // If u/v cannot overflow, + return (__div_u64(u, v)); // just do one division. + } else { // If u/v would overflow: + u1 = u >> 32; // Break u into two halves. + u0 = u & 0xFFFFFFFF; + q1 = __div_u64(u1, v); // First quotient digit. + k = u1 - q1 * v; // First remainder, < v. + u0 += (k << 32); + q0 = __div_u64(u0, v); // Second quotient digit. + return ((q1 << 32) + q0); + } + } else { // If v >= 2**32: + n = nlz64(v); // 0 <= n <= 31. + v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. + u1 = u >> 1; // To ensure no overflow. + q1 = __div_u64(u1, v1); // Get quotient from + q0 = (q1 << n) >> 31; // Undo normalization and + // division of u by 2. + if (q0 != 0) // Make q0 correct or + q0 = q0 - 1; // too small by 1. + if ((u - q0 * v) >= v) + q0 = q0 + 1; // Now q0 is correct. + + return (q0); + } +} +EXPORT_SYMBOL(__udivdi3); + +#ifndef abs64 +/* CSTYLED */ +#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) +#endif + +/* + * Implementation of 64-bit signed division for 32-bit machines. + */ +int64_t +__divdi3(int64_t u, int64_t v) +{ + int64_t q, t; + q = __udivdi3(abs64(u), abs64(v)); + t = (u ^ v) >> 63; // If u, v have different + return ((q ^ t) - t); // signs, negate q. +} +EXPORT_SYMBOL(__divdi3); + +/* + * Implementation of 64-bit unsigned modulo for 32-bit machines. + */ +uint64_t +__umoddi3(uint64_t dividend, uint64_t divisor) +{ + return (dividend - (divisor * __udivdi3(dividend, divisor))); +} +EXPORT_SYMBOL(__umoddi3); + +/* 64-bit signed modulo for 32-bit machines. 
*/ +int64_t +__moddi3(int64_t n, int64_t d) +{ + int64_t q; + boolean_t nn = B_FALSE; + + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) + d = -d; + + q = __umoddi3(n, d); + + return (nn ? -q : q); +} +EXPORT_SYMBOL(__moddi3); + +/* + * Implementation of 64-bit unsigned division/modulo for 32-bit machines. + */ +uint64_t +__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) +{ + uint64_t q = __udivdi3(n, d); + if (r) + *r = n - d * q; + return (q); +} +EXPORT_SYMBOL(__udivmoddi4); + +/* + * Implementation of 64-bit signed division/modulo for 32-bit machines. + */ +int64_t +__divmoddi4(int64_t n, int64_t d, int64_t *r) +{ + int64_t q, rr; + boolean_t nn = B_FALSE; + boolean_t nd = B_FALSE; + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) { + nd = B_TRUE; + d = -d; + } + + q = __udivmoddi4(n, d, (uint64_t *)&rr); + + if (nn != nd) + q = -q; + if (nn) + rr = -rr; + if (r) + *r = rr; + return (q); +} +EXPORT_SYMBOL(__divmoddi4); + +#if defined(__arm) || defined(__arm__) +/* + * Implementation of 64-bit (un)signed division for 32-bit arm machines. + * + * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) + * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, + * and the remainder in {r2, r3}. The return type is specifically left + * set to 'void' to ensure the compiler does not overwrite these registers + * during the return. 
All results are in registers as per ABI + */ +void +__aeabi_uldivmod(uint64_t u, uint64_t v) +{ + uint64_t res; + uint64_t mod; + + res = __udivdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_uldivmod); + +void +__aeabi_ldivmod(int64_t u, int64_t v) +{ + int64_t res; + uint64_t mod; + + res = __divdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_ldivmod); +#endif /* __arm || __arm__ */ + +#endif /* BITS_PER_LONG */ diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c index 1c984f221c7d..76ee71074cb5 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c @@ -27,8 +27,6 @@ #include <sys/taskq.h> -#ifdef _KERNEL #define CREATE_TRACE_POINTS #include <sys/trace.h> #include <sys/trace_taskq.h> -#endif diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index b2eae5d00b10..5992957280e4 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -59,6 +59,18 @@ typedef struct zone_dataset { char zd_dsname[]; /* name of the member dataset */ } zone_dataset_t; +/* + * 
UID-based dataset zoning: allows delegating datasets to all user + * namespaces owned by a specific UID, enabling rootless container support. + */ +typedef struct zone_uid_datasets { + struct list_head zuds_list; /* zone_uid_datasets linkage */ + kuid_t zuds_owner; /* owner UID */ + struct list_head zuds_datasets; /* datasets for this UID */ +} zone_uid_datasets_t; + +static struct list_head zone_uid_datasets; + #ifdef CONFIG_USER_NS /* @@ -138,6 +150,18 @@ zone_datasets_lookup(unsigned int nsinum) } #ifdef CONFIG_USER_NS +static zone_uid_datasets_t * +zone_uid_datasets_lookup(kuid_t owner) +{ + zone_uid_datasets_t *zuds; + + list_for_each_entry(zuds, &zone_uid_datasets, zuds_list) { + if (uid_eq(zuds->zuds_owner, owner)) + return (zuds); + } + return (NULL); +} + static struct zone_dataset * zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen) { @@ -232,6 +256,62 @@ zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd) EXPORT_SYMBOL(zone_dataset_attach); int +zone_dataset_attach_uid(cred_t *cred, const char *dataset, uid_t owner_uid) +{ +#ifdef CONFIG_USER_NS + zone_uid_datasets_t *zuds; + zone_dataset_t *zd; + int error; + size_t dsnamelen; + kuid_t kowner; + + /* Only root can attach datasets to UIDs */ + if ((error = zone_dataset_cred_check(cred)) != 0) + return (error); + if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) + return (error); + + kowner = make_kuid(current_user_ns(), owner_uid); + if (!uid_valid(kowner)) + return (EINVAL); + + mutex_enter(&zone_datasets_lock); + + /* Find or create UID entry */ + zuds = zone_uid_datasets_lookup(kowner); + if (zuds == NULL) { + zuds = kmem_alloc(sizeof (zone_uid_datasets_t), KM_SLEEP); + INIT_LIST_HEAD(&zuds->zuds_list); + INIT_LIST_HEAD(&zuds->zuds_datasets); + zuds->zuds_owner = kowner; + list_add_tail(&zuds->zuds_list, &zone_uid_datasets); + } else { + /* Check if dataset already attached */ + list_for_each_entry(zd, &zuds->zuds_datasets, zd_list) { + if 
(zd->zd_dsnamelen == dsnamelen && + strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) { + mutex_exit(&zone_datasets_lock); + return (EEXIST); + } + } + } + + /* Add dataset to UID's list */ + zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP); + zd->zd_dsnamelen = dsnamelen; + strlcpy(zd->zd_dsname, dataset, dsnamelen + 1); + INIT_LIST_HEAD(&zd->zd_list); + list_add_tail(&zd->zd_list, &zuds->zuds_datasets); + + mutex_exit(&zone_datasets_lock); + return (0); +#else + return (ENXIO); +#endif /* CONFIG_USER_NS */ +} +EXPORT_SYMBOL(zone_dataset_attach_uid); + +int zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd) { #ifdef CONFIG_USER_NS @@ -280,6 +360,217 @@ zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd) } EXPORT_SYMBOL(zone_dataset_detach); +int +zone_dataset_detach_uid(cred_t *cred, const char *dataset, uid_t owner_uid) +{ +#ifdef CONFIG_USER_NS + zone_uid_datasets_t *zuds; + zone_dataset_t *zd; + int error; + size_t dsnamelen; + kuid_t kowner; + + if ((error = zone_dataset_cred_check(cred)) != 0) + return (error); + if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0) + return (error); + + kowner = make_kuid(current_user_ns(), owner_uid); + if (!uid_valid(kowner)) + return (EINVAL); + + mutex_enter(&zone_datasets_lock); + + zuds = zone_uid_datasets_lookup(kowner); + if (zuds == NULL) { + mutex_exit(&zone_datasets_lock); + return (ENOENT); + } + + /* Find and remove dataset */ + list_for_each_entry(zd, &zuds->zuds_datasets, zd_list) { + if (zd->zd_dsnamelen == dsnamelen && + strncmp(zd->zd_dsname, dataset, dsnamelen) == 0) { + list_del(&zd->zd_list); + kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); + + /* Remove UID entry if no more datasets */ + if (list_empty(&zuds->zuds_datasets)) { + list_del(&zuds->zuds_list); + kmem_free(zuds, sizeof (*zuds)); + } + + mutex_exit(&zone_datasets_lock); + return (0); + } + } + + mutex_exit(&zone_datasets_lock); + return (ENOENT); +#else + return (ENXIO); 
+#endif /* CONFIG_USER_NS */ +} +EXPORT_SYMBOL(zone_dataset_detach_uid); + +/* + * Callback for looking up zoned_uid property (registered by ZFS module). + */ +static zone_get_zoned_uid_fn_t zone_get_zoned_uid_fn = NULL; + +void +zone_register_zoned_uid_callback(zone_get_zoned_uid_fn_t fn) +{ + zone_get_zoned_uid_fn = fn; +} +EXPORT_SYMBOL(zone_register_zoned_uid_callback); + +void +zone_unregister_zoned_uid_callback(void) +{ + zone_get_zoned_uid_fn = NULL; +} +EXPORT_SYMBOL(zone_unregister_zoned_uid_callback); + +#ifdef CONFIG_USER_NS +/* + * Check if a dataset is the delegation root (has zoned_uid set locally). + */ +static boolean_t +zone_dataset_is_zoned_uid_root(const char *dataset, uid_t zoned_uid) +{ + char *root; + uid_t found_uid; + boolean_t is_root; + + if (zone_get_zoned_uid_fn == NULL) + return (B_FALSE); + + root = kmem_alloc(MAXPATHLEN, KM_SLEEP); + found_uid = zone_get_zoned_uid_fn(dataset, root, MAXPATHLEN); + is_root = (found_uid == zoned_uid && strcmp(root, dataset) == 0); + kmem_free(root, MAXPATHLEN); + return (is_root); +} +#endif /* CONFIG_USER_NS */ + +/* + * Core authorization check for zoned_uid write delegation. 
+ */ +zone_admin_result_t +zone_dataset_admin_check(const char *dataset, zone_uid_op_t op, + const char *aux_dataset) +{ +#ifdef CONFIG_USER_NS + struct user_namespace *user_ns; + char *delegation_root; + uid_t zoned_uid, ns_owner_uid; + int write_unused; + zone_admin_result_t result = ZONE_ADMIN_NOT_APPLICABLE; + + /* Step 1: If in global zone, not applicable */ + if (INGLOBALZONE(curproc)) + return (ZONE_ADMIN_NOT_APPLICABLE); + + /* Step 2: Need callback to be registered */ + if (zone_get_zoned_uid_fn == NULL) + return (ZONE_ADMIN_NOT_APPLICABLE); + + delegation_root = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* Step 3: Find delegation root */ + zoned_uid = zone_get_zoned_uid_fn(dataset, delegation_root, + MAXPATHLEN); + if (zoned_uid == 0) + goto out; + + /* Step 4: Verify namespace owner matches */ + user_ns = current_user_ns(); + ns_owner_uid = from_kuid(&init_user_ns, user_ns->owner); + if (ns_owner_uid != zoned_uid) + goto out; + + /* Step 5: Tiered capability check based on operation class */ + { + int required_cap; + switch (op) { + case ZONE_OP_DESTROY: + case ZONE_OP_RENAME: + case ZONE_OP_CLONE: + required_cap = CAP_SYS_ADMIN; + break; + case ZONE_OP_CREATE: + case ZONE_OP_SNAPSHOT: + case ZONE_OP_SETPROP: + required_cap = CAP_FOWNER; + break; + default: + required_cap = CAP_SYS_ADMIN; + break; + } + if (!ns_capable(user_ns, required_cap)) { + result = ZONE_ADMIN_DENIED; + goto out; + } + } + + /* Step 6: Operation-specific constraints */ + switch (op) { + case ZONE_OP_DESTROY: + /* Cannot destroy the delegation root itself */ + if (zone_dataset_is_zoned_uid_root(dataset, zoned_uid)) { + result = ZONE_ADMIN_DENIED; + goto out; + } + break; + + case ZONE_OP_RENAME: + /* Cannot rename outside delegation subtree */ + if (aux_dataset != NULL) { + char *dst_root; + uid_t dst_uid; + + dst_root = kmem_alloc(MAXPATHLEN, KM_SLEEP); + dst_uid = zone_get_zoned_uid_fn(aux_dataset, + dst_root, MAXPATHLEN); + if (dst_uid != zoned_uid || + strcmp(dst_root, 
delegation_root) != 0) { + kmem_free(dst_root, MAXPATHLEN); + result = ZONE_ADMIN_DENIED; + goto out; + } + kmem_free(dst_root, MAXPATHLEN); + } + break; + + case ZONE_OP_CLONE: + /* Clone source must be visible */ + if (aux_dataset != NULL) { + if (!zone_dataset_visible(aux_dataset, &write_unused)) { + result = ZONE_ADMIN_DENIED; + goto out; + } + } + break; + + case ZONE_OP_CREATE: + case ZONE_OP_SNAPSHOT: + case ZONE_OP_SETPROP: + /* No additional constraints */ + break; + } + + result = ZONE_ADMIN_ALLOWED; +out: + kmem_free(delegation_root, MAXPATHLEN); + return (result); +#else + (void) dataset, (void) op, (void) aux_dataset; + return (ZONE_ADMIN_NOT_APPLICABLE); +#endif +} +EXPORT_SYMBOL(zone_dataset_admin_check); + /* * A dataset is visible if: * - It is a parent of a namespace entry. @@ -293,34 +584,19 @@ EXPORT_SYMBOL(zone_dataset_detach); * The parent datasets of namespace entries are visible and * read-only to provide a path back to the root of the pool. */ -int -zone_dataset_visible(const char *dataset, int *write) +/* + * Helper function to check if a dataset matches against a list of + * delegated datasets. Returns visibility and sets write permission. + */ +static int +zone_dataset_check_list(struct list_head *datasets, const char *dataset, + size_t dsnamelen, int *write) { - zone_datasets_t *zds; zone_dataset_t *zd; - size_t dsnamelen, zd_len; - int visible; - - /* Default to read-only, in case visible is returned. 
*/ - if (write != NULL) - *write = 0; - if (zone_dataset_name_check(dataset, &dsnamelen) != 0) - return (0); - if (INGLOBALZONE(curproc)) { - if (write != NULL) - *write = 1; - return (1); - } + size_t zd_len; + int visible = 0; - mutex_enter(&zone_datasets_lock); - zds = zone_datasets_lookup(crgetzoneid(curproc->cred)); - if (zds == NULL) { - mutex_exit(&zone_datasets_lock); - return (0); - } - - visible = 0; - list_for_each_entry(zd, &zds->zds_datasets, zd_list) { + list_for_each_entry(zd, datasets, zd_list) { zd_len = strlen(zd->zd_dsname); if (zd_len > dsnamelen) { /* @@ -352,7 +628,8 @@ zone_dataset_visible(const char *dataset, int *write) * the namespace entry. */ visible = memcmp(zd->zd_dsname, dataset, - zd_len) == 0 && dataset[zd_len] == '/'; + zd_len) == 0 && (dataset[zd_len] == '/' || + dataset[zd_len] == '@' || dataset[zd_len] == '#'); if (visible) { if (write != NULL) *write = 1; @@ -361,9 +638,70 @@ zone_dataset_visible(const char *dataset, int *write) } } - mutex_exit(&zone_datasets_lock); return (visible); } + +#if defined(CONFIG_USER_NS) +/* + * Check UID-based zoning visibility for the current process. + * Must be called with zone_datasets_lock held. + */ +static int +zone_dataset_visible_uid(const char *dataset, size_t dsnamelen, int *write) +{ + zone_uid_datasets_t *zuds; + + zuds = zone_uid_datasets_lookup(curproc->cred->user_ns->owner); + if (zuds != NULL) + return (zone_dataset_check_list(&zuds->zuds_datasets, dataset, + dsnamelen, write)); + return (0); +} +#endif + +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_datasets_t *zds; + size_t dsnamelen; + int visible; + + /* Default to read-only, in case visible is returned. 
*/ + if (write != NULL) + *write = 0; + if (zone_dataset_name_check(dataset, &dsnamelen) != 0) + return (0); + if (INGLOBALZONE(curproc)) { + if (write != NULL) + *write = 1; + return (1); + } + + mutex_enter(&zone_datasets_lock); + + /* First, check namespace-specific zoning (existing behavior) */ + zds = zone_datasets_lookup(crgetzoneid(curproc->cred)); + if (zds != NULL) { + visible = zone_dataset_check_list(&zds->zds_datasets, dataset, + dsnamelen, write); + if (visible) { + mutex_exit(&zone_datasets_lock); + return (visible); + } + } + + /* Second, check UID-based zoning */ +#if defined(CONFIG_USER_NS) + visible = zone_dataset_visible_uid(dataset, dsnamelen, write); + if (visible) { + mutex_exit(&zone_datasets_lock); + return (visible); + } +#endif + + mutex_exit(&zone_datasets_lock); + return (0); +} EXPORT_SYMBOL(zone_dataset_visible); unsigned int @@ -395,8 +733,9 @@ EXPORT_SYMBOL(crgetzoneid); boolean_t inglobalzone(proc_t *proc) { + (void) proc; #if defined(CONFIG_USER_NS) - return (proc->cred->user_ns == &init_user_ns); + return (current_user_ns() == &init_user_ns); #else return (B_TRUE); #endif @@ -408,6 +747,7 @@ spl_zone_init(void) { mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL); INIT_LIST_HEAD(&zone_datasets); + INIT_LIST_HEAD(&zone_uid_datasets); return (0); } @@ -415,6 +755,7 @@ void spl_zone_fini(void) { zone_datasets_t *zds; + zone_uid_datasets_t *zuds; zone_dataset_t *zd; /* @@ -423,6 +764,22 @@ spl_zone_fini(void) * namespace is destroyed, just do it here, since spl is about to go * out of context. 
*/ + + /* Clean up UID-based delegations */ + while (!list_empty(&zone_uid_datasets)) { + zuds = list_entry(zone_uid_datasets.next, + zone_uid_datasets_t, zuds_list); + while (!list_empty(&zuds->zuds_datasets)) { + zd = list_entry(zuds->zuds_datasets.next, + zone_dataset_t, zd_list); + list_del(&zd->zd_list); + kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1); + } + list_del(&zuds->zuds_list); + kmem_free(zuds, sizeof (*zuds)); + } + + /* Clean up namespace-based delegations */ while (!list_empty(&zone_datasets)) { zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list); while (!list_empty(&zds->zds_datasets)) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index 6478c834b7a5..dbc9aad936bf 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -410,6 +410,22 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) return (0); } +int +param_set_l2arc_dwpd_limit(const char *buf, zfs_kernel_param_t *kp) +{ + uint64_t old_val = l2arc_dwpd_limit; + int error; + + error = spl_param_set_u64(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + if (l2arc_dwpd_limit != old_val) + l2arc_dwpd_bump_reset(); + + return (0); +} + #ifdef CONFIG_MEMORY_HOTPLUG static int arc_hotplug_callback(struct notifier_block *self, unsigned long action, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c index d6323fd56a8f..91010bdf642a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c @@ -39,8 +39,10 @@ #include <sys/dsl_prop.h> #include <sys/fm/util.h> #include <sys/dsl_scan.h> +#include <sys/dmu.h> #include <sys/fs/zfs.h> #include <sys/kstat.h> +#include <sys/zone.h> #include "zfs_prop.h" @@ -122,16 +124,60 @@ spa_history_zone(void) return ("linux"); } +static int 
+spa_restore_zoned_uid_cb(const char *dsname, void *arg) +{ + (void) arg; + uint64_t zoned_uid = 0; + + if (dsl_prop_get(dsname, "zoned_uid", 8, 1, &zoned_uid, NULL) != 0) + return (0); + + if (zoned_uid != 0) { + int err = zone_dataset_attach_uid(kcred, dsname, + (uid_t)zoned_uid); + if (err != 0 && err != EEXIST) { + cmn_err(CE_WARN, "failed to restore zoned_uid for " + "'%s' (uid %llu): %d", dsname, + (unsigned long long)zoned_uid, err); + } + } + return (0); +} + void spa_import_os(spa_t *spa) { - (void) spa; + (void) dmu_objset_find(spa_name(spa), + spa_restore_zoned_uid_cb, NULL, DS_FIND_CHILDREN); +} + +static int +spa_cleanup_zoned_uid_cb(const char *dsname, void *arg) +{ + (void) arg; + uint64_t zoned_uid = 0; + + if (dsl_prop_get(dsname, "zoned_uid", 8, 1, &zoned_uid, NULL) != 0) + return (0); + + if (zoned_uid != 0) { + int err = zone_dataset_detach_uid(kcred, dsname, + (uid_t)zoned_uid); + if (err != 0 && err != ENOENT) { + cmn_err(CE_WARN, "failed to detach zoned_uid for " + "'%s' (uid %llu): %d", dsname, + (unsigned long long)zoned_uid, err); + } + } + return (0); } void spa_export_os(spa_t *spa) { - (void) spa; + (void) dmu_objset_find(spa_name(spa), + spa_cleanup_zoned_uid_cb, NULL, DS_FIND_CHILDREN); } void diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 1bd3500e9f66..66e10584ab5e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -445,7 +445,14 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); /* Inform the ZIO pipeline that we are non-rotational */ +#ifdef HAVE_BLK_QUEUE_ROT + v->vdev_nonrot = !blk_queue_rot(bdev_get_queue(bdev)); +#else v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); +#endif + + /* Is backed by a block device. 
*/ + v->vdev_is_blkdev = B_TRUE; /* Physical volume size in bytes for the partition */ *psize = bdev_capacity(bdev); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index 79fd8911102d..c73ef86df4dc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -251,16 +251,7 @@ snapentry_compare_by_name(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; - int ret; - - ret = strcmp(se_a->se_name, se_b->se_name); - - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); + return (TREE_ISIGN(strcmp(se_a->se_name, se_b->se_name))); } /* @@ -272,15 +263,10 @@ snapentry_compare_by_objsetid(const void *a, const void *b) const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; - if (se_a->se_spa != se_b->se_spa) - return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1); - - if (se_a->se_objsetid < se_b->se_objsetid) - return (-1); - else if (se_a->se_objsetid > se_b->se_objsetid) - return (1); - else - return (0); + int cmp = TREE_PCMP(se_a->se_spa, se_b->se_spa); + if (cmp != 0) + return (cmp); + return (TREE_CMP(se_a->se_objsetid, se_b->se_objsetid)); } /* @@ -1201,8 +1187,10 @@ zfsctl_snapshot_mount(struct path *path, int flags) error = zfsctl_snapshot_name(zfsvfs, dname(dentry), ZFS_MAX_DATASET_NAME_LEN, full_name); - if (error) + if (error) { + zfs_exit(zfsvfs, FTAG); goto error; + } if (is_current_chrooted() == 0) { /* @@ -1220,6 +1208,7 @@ zfsctl_snapshot_mount(struct path *path, int flags) error = get_root_path(&mnt_path, m, MAXPATHLEN); if (error != 0) { kmem_free(m, MAXPATHLEN); + zfs_exit(zfsvfs, FTAG); goto error; } mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); @@ -1253,6 +1242,33 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfs_snapshot_no_setuid ? 
"nosuid" : "suid"); /* + * Release z_teardown_lock before potentially blocking operations + * (cv_wait for concurrent mounts, call_usermodehelper for the mount + * helper). Holding z_teardown_lock(R) across call_usermodehelper + * deadlocks with namespace_sem: the mount helper needs + * namespace_sem(W) via move_mount, while /proc/self/mountinfo + * readers hold namespace_sem(R) and need z_teardown_lock(R) via + * zpl_show_devname. A concurrent zfs_suspend_fs queuing + * z_teardown_lock(W) blocks new readers, completing the cycle. + * See https://github.com/openzfs/zfs/issues/18409 + * + * Releasing the lock allows zfs_suspend_fs to proceed during + * the mount, so dmu_objset_hold in zpl_get_tree can transiently + * fail with ENOENT during the clone swap. The mount helper + * fails, this function returns EISDIR, and the VFS silently + * falls back to the ctldir stub (empty directory). The caller + * gets the stub inode instead of the real snapshot root until + * the next access retries the automount. + * + * Safe because everything below operates on local string copies + * (full_name, full_path) or uses its own synchronization + * (zfs_snapshot_lock, se_mtx). The parent zfsvfs pointer + * remains valid because we hold a path reference to the + * automount trigger dentry. + */ + zfs_exit(zfsvfs, FTAG); + + /* * Check if snapshot is already being mounted. If found, wait for * pending mount to complete before returning success. 
*/ @@ -1366,8 +1382,7 @@ zfsctl_snapshot_mount(struct path *path, int flags) error: kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); - - zfs_exit(zfsvfs, FTAG); + kmem_free(options, 7); return (error); } @@ -1379,17 +1394,31 @@ int zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, struct inode **ipp) { + zfsvfs_t *zfsvfs = sb->s_fs_info; int error; struct path path; char *mnt; struct dentry *dentry; + zfs_snapentry_t *se; mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); - error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, - MAXPATHLEN, mnt); - if (error) - goto out; + /* + * Try the in-memory AVL tree first for previously mounted + * snapshots, falling back to the on-disk scan if not found. + */ + rw_enter(&zfs_snapshot_lock, RW_READER); + se = zfsctl_snapshot_find_by_objsetid(zfsvfs->z_os->os_spa, objsetid); + rw_exit(&zfs_snapshot_lock); + if (se != NULL) { + strlcpy(mnt, se->se_path, MAXPATHLEN); + zfsctl_snapshot_rele(se); + } else { + error = zfsctl_snapshot_path_objset(zfsvfs, objsetid, + MAXPATHLEN, mnt); + if (error) + goto out; + } /* Trigger automount */ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c index 5421a441b323..ce6092be1da7 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c @@ -170,6 +170,8 @@ zfs_ioc_userns_attach(zfs_cmd_t *zc) */ if (error == ENOTTY) error = ZFS_ERR_NOT_USER_NAMESPACE; + if (error == ENXIO) + error = ZFS_ERR_NO_USER_NS_SUPPORT; return (error); } @@ -190,6 +192,8 @@ zfs_ioc_userns_detach(zfs_cmd_t *zc) */ if (error == ENOTTY) error = ZFS_ERR_NOT_USER_NAMESPACE; + if (error == ENXIO) + error = ZFS_ERR_NO_USER_NS_SUPPORT; return (error); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c 
index 8a7d14ab6119..9c0d92551843 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2026, TrueNAS. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -64,53 +65,15 @@ #include <linux/fs.h> #include "zfs_comutil.h" -enum { - TOKEN_RO, - TOKEN_RW, - TOKEN_SETUID, - TOKEN_NOSETUID, - TOKEN_EXEC, - TOKEN_NOEXEC, - TOKEN_DEVICES, - TOKEN_NODEVICES, - TOKEN_DIRXATTR, - TOKEN_SAXATTR, - TOKEN_XATTR, - TOKEN_NOXATTR, - TOKEN_ATIME, - TOKEN_NOATIME, - TOKEN_RELATIME, - TOKEN_NORELATIME, - TOKEN_NBMAND, - TOKEN_NONBMAND, - TOKEN_MNTPOINT, - TOKEN_LAST, -}; - -static const match_table_t zpl_tokens = { - { TOKEN_RO, MNTOPT_RO }, - { TOKEN_RW, MNTOPT_RW }, - { TOKEN_SETUID, MNTOPT_SETUID }, - { TOKEN_NOSETUID, MNTOPT_NOSETUID }, - { TOKEN_EXEC, MNTOPT_EXEC }, - { TOKEN_NOEXEC, MNTOPT_NOEXEC }, - { TOKEN_DEVICES, MNTOPT_DEVICES }, - { TOKEN_NODEVICES, MNTOPT_NODEVICES }, - { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, - { TOKEN_SAXATTR, MNTOPT_SAXATTR }, - { TOKEN_XATTR, MNTOPT_XATTR }, - { TOKEN_NOXATTR, MNTOPT_NOXATTR }, - { TOKEN_ATIME, MNTOPT_ATIME }, - { TOKEN_NOATIME, MNTOPT_NOATIME }, - { TOKEN_RELATIME, MNTOPT_RELATIME }, - { TOKEN_NORELATIME, MNTOPT_NORELATIME }, - { TOKEN_NBMAND, MNTOPT_NBMAND }, - { TOKEN_NONBMAND, MNTOPT_NONBMAND }, - { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, - { TOKEN_LAST, NULL }, -}; +vfs_t * +zfsvfs_vfs_alloc(void) +{ + vfs_t *vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); + mutex_init(&vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL); + return (vfsp); +} -static void +void zfsvfs_vfs_free(vfs_t *vfsp) { if (vfsp != NULL) { @@ -121,139 +84,6 @@ zfsvfs_vfs_free(vfs_t *vfsp) } } -static int -zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) -{ - switch (token) { - case TOKEN_RO: - 
vfsp->vfs_readonly = B_TRUE; - vfsp->vfs_do_readonly = B_TRUE; - break; - case TOKEN_RW: - vfsp->vfs_readonly = B_FALSE; - vfsp->vfs_do_readonly = B_TRUE; - break; - case TOKEN_SETUID: - vfsp->vfs_setuid = B_TRUE; - vfsp->vfs_do_setuid = B_TRUE; - break; - case TOKEN_NOSETUID: - vfsp->vfs_setuid = B_FALSE; - vfsp->vfs_do_setuid = B_TRUE; - break; - case TOKEN_EXEC: - vfsp->vfs_exec = B_TRUE; - vfsp->vfs_do_exec = B_TRUE; - break; - case TOKEN_NOEXEC: - vfsp->vfs_exec = B_FALSE; - vfsp->vfs_do_exec = B_TRUE; - break; - case TOKEN_DEVICES: - vfsp->vfs_devices = B_TRUE; - vfsp->vfs_do_devices = B_TRUE; - break; - case TOKEN_NODEVICES: - vfsp->vfs_devices = B_FALSE; - vfsp->vfs_do_devices = B_TRUE; - break; - case TOKEN_DIRXATTR: - vfsp->vfs_xattr = ZFS_XATTR_DIR; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_SAXATTR: - vfsp->vfs_xattr = ZFS_XATTR_SA; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_XATTR: - vfsp->vfs_xattr = ZFS_XATTR_SA; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_NOXATTR: - vfsp->vfs_xattr = ZFS_XATTR_OFF; - vfsp->vfs_do_xattr = B_TRUE; - break; - case TOKEN_ATIME: - vfsp->vfs_atime = B_TRUE; - vfsp->vfs_do_atime = B_TRUE; - break; - case TOKEN_NOATIME: - vfsp->vfs_atime = B_FALSE; - vfsp->vfs_do_atime = B_TRUE; - break; - case TOKEN_RELATIME: - vfsp->vfs_relatime = B_TRUE; - vfsp->vfs_do_relatime = B_TRUE; - break; - case TOKEN_NORELATIME: - vfsp->vfs_relatime = B_FALSE; - vfsp->vfs_do_relatime = B_TRUE; - break; - case TOKEN_NBMAND: - vfsp->vfs_nbmand = B_TRUE; - vfsp->vfs_do_nbmand = B_TRUE; - break; - case TOKEN_NONBMAND: - vfsp->vfs_nbmand = B_FALSE; - vfsp->vfs_do_nbmand = B_TRUE; - break; - case TOKEN_MNTPOINT: - if (vfsp->vfs_mntpoint != NULL) - kmem_strfree(vfsp->vfs_mntpoint); - vfsp->vfs_mntpoint = match_strdup(&args[0]); - if (vfsp->vfs_mntpoint == NULL) - return (SET_ERROR(ENOMEM)); - break; - default: - break; - } - - return (0); -} - -/* - * Parse the raw mntopts and return a vfs_t describing the options. 
- */ -static int -zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) -{ - vfs_t *tmp_vfsp; - int error; - - tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); - mutex_init(&tmp_vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL); - - if (mntopts != NULL) { - substring_t args[MAX_OPT_ARGS]; - char *tmp_mntopts, *p, *t; - int token; - - tmp_mntopts = t = kmem_strdup(mntopts); - if (tmp_mntopts == NULL) - return (SET_ERROR(ENOMEM)); - - while ((p = strsep(&t, ",")) != NULL) { - if (!*p) - continue; - - args[0].to = args[0].from = NULL; - token = match_token(p, zpl_tokens, args); - error = zfsvfs_parse_option(p, token, args, tmp_vfsp); - if (error) { - kmem_strfree(tmp_mntopts); - zfsvfs_vfs_free(tmp_vfsp); - return (error); - } - } - - kmem_strfree(tmp_mntopts); - } - - *vfsp = tmp_vfsp; - - return (0); -} - boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { @@ -1486,20 +1316,16 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) static atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); int -zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) +zfs_domount(struct super_block *sb, const char *osname, + vfs_t *vfs, int silent) { - const char *osname = zm->mnt_osname; struct inode *root_inode = NULL; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; - vfs_t *vfs = NULL; int canwrite; int dataset_visible_zone; - ASSERT(zm); - ASSERT(osname); - dataset_visible_zone = zone_dataset_visible(osname, &canwrite); /* @@ -1511,10 +1337,6 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) return (SET_ERROR(EPERM)); } - error = zfsvfs_parse_options(zm->mnt_data, &vfs); - if (error) - return (error); - /* * If a non-writable filesystem is being mounted without the * read-only flag, pretend it was set, as done for snapshots. 
@@ -1523,16 +1345,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) vfs->vfs_readonly = B_TRUE; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); - if (error) { - zfsvfs_vfs_free(vfs); + if (error) goto out; - } if ((error = dsl_prop_get_integer(osname, "recordsize", - &recordsize, NULL))) { - zfsvfs_vfs_free(vfs); + &recordsize, NULL))) goto out; - } vfs->vfs_data = zfsvfs; zfsvfs->z_vfs = vfs; @@ -1614,6 +1432,13 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) out: if (error) { if (zfsvfs != NULL) { + /* + * We're returning error, so the caller still owns + * the mount options vfs_t. Remove them from zfsvfs + * so we don't try to free them. + */ + zfsvfs->z_vfs = NULL; + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } @@ -1704,24 +1529,16 @@ zfs_umount(struct super_block *sb) } int -zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) +zfs_remount(struct super_block *sb, vfs_t *vfsp, int flags) { zfsvfs_t *zfsvfs = sb->s_fs_info; - vfs_t *vfsp; boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); - int error; if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && - !(*flags & SB_RDONLY)) { - *flags |= SB_RDONLY; + !(flags & SB_RDONLY)) return (EROFS); - } - error = zfsvfs_parse_options(zm->mnt_data, &vfsp); - if (error) - return (error); - - if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) + if (!zfs_is_readonly(zfsvfs) && (flags & SB_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); zfs_unregister_callbacks(zfsvfs); @@ -1732,7 +1549,7 @@ zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) if (!issnap) (void) zfs_register_callbacks(vfsp); - return (error); + return (0); } int @@ -1963,15 +1780,6 @@ bail: /* release the VFS ops */ rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); - - if (err != 0) { - /* - * Since we couldn't setup the sa framework, try to force - * unmount this file system. 
- */ - if (zfsvfs->z_os) - (void) zfs_umount(zfsvfs->z_sb); - } return (err); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index a788e3fd4862..e65f81230124 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -200,8 +200,9 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) * Keep a count of the synchronous opens in the znode. On first * synchronous open we must convert all previous async transactions * into sync to keep correct ordering. + * Skip it for snapshot, as it won't have any transactions. */ - if (flag & O_SYNC) { + if (!zfsvfs->z_issnap && (flag & O_SYNC)) { if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) zil_async_to_sync(zfsvfs->z_log, zp->z_id); } @@ -222,7 +223,7 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) return (error); /* Decrement the synchronous opens in the znode */ - if (flag & O_SYNC) + if (!zfsvfs->z_issnap && (flag & O_SYNC)) atomic_dec_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); @@ -2581,8 +2582,19 @@ top: if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - if (mask != 0) + if (mask != 0) { zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + /* + * Ensure that the z_seq is always incremented on setattr + * operation. This is required for change accounting for + * NFS clients. + * + * ATTR_MODE already increments via zfs_acl_chmod_setattr. + * ATTR_SIZE already increments via zfs_freesp. 
+ */ + if (!(mask & (ATTR_MODE | ATTR_SIZE))) + zp->z_seq++; + } mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) @@ -3513,7 +3525,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, boolean_t is_tmpfile = 0; uint64_t txg; - is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); + is_tmpfile = (sip->i_nlink == 0 && + (inode_state_read_once(sip) & I_LINKABLE)); ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c index 711da151f65e..0568bb63c75e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c @@ -37,8 +37,8 @@ zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent) { fstrans_cookie_t cookie; ushort_t empty_fid = 0; - fid_t *fid; - int len_bytes, rc; + fid_t *fid, *pfid; + int len_bytes, required_len, parent_len, rc, prc, fh_type; len_bytes = *max_len * sizeof (__u32); @@ -56,11 +56,44 @@ zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent) else rc = zfs_fid(ip, fid); + required_len = offsetof(fid_t, fid_data) + fid->fid_len; + + /* + * Kernel has requested that the resulting file handle contain + * a reference to the provided parent. This typically would happen + * if the NFS export has subtree checking enabled. 
+ */ + if (parent != NULL) { + if ((rc == 0) && (len_bytes > + required_len + offsetof(fid_t, fid_data))) { + parent_len = len_bytes - required_len; + pfid = (fid_t *)((char *)fh + required_len); + pfid->fid_len = parent_len - offsetof(fid_t, fid_data); + } else { + empty_fid = 0; + pfid = (fid_t *)&empty_fid; + } + + if (zfsctl_is_node(parent)) + prc = zfsctl_fid(parent, pfid); + else + prc = zfs_fid(parent, pfid); + + if (rc == 0 && prc != 0) + rc = prc; + + required_len += offsetof(fid_t, fid_data) + + pfid->fid_len; + fh_type = FILEID_INO32_GEN_PARENT; + } else { + fh_type = FILEID_INO32_GEN; + } + spl_fstrans_unmark(cookie); - len_bytes = offsetof(fid_t, fid_data) + fid->fid_len; - *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32); - return (rc == 0 ? FILEID_INO32_GEN : 255); + *max_len = roundup(required_len, sizeof (__u32)) / sizeof (__u32); + + return (rc == 0 ? fh_type : FILEID_INVALID); } static struct dentry * @@ -74,7 +107,8 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, len_bytes = fh_len * sizeof (__u32); - if (fh_type != FILEID_INO32_GEN || + if ((fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) || len_bytes < offsetof(fid_t, fid_data) || len_bytes < offsetof(fid_t, fid_data) + fid->fid_len) return (ERR_PTR(-EINVAL)); @@ -104,6 +138,46 @@ zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, return (d_obtain_alias(ip)); } +static struct dentry * +zpl_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + /* + * Convert the provided struct fid to a dentry for the parent + * This is possible only if it was created with the parent, + * e.g. type is FILEID_INO32_GEN_PARENT. When this type of + * filehandle is created we simply pack the parent fid_t + * after the entry's fid_t. So this function will adjust + * offset in the provided buffer to the begining of the + * parent fid_t and call zpl_fh_to_dentry() on it. 
+ */ + fid_t *fid = (fid_t *)fh; + fid_t *pfid; + int len_bytes, parent_len_bytes, child_fid_bytes, parent_fh_len; + + len_bytes = fh_len * sizeof (__u32); + + if ((fh_type != FILEID_INO32_GEN_PARENT) || + len_bytes < offsetof(fid_t, fid_data) || + len_bytes < offsetof(fid_t, fid_data) + fid->fid_len) + return (ERR_PTR(-EINVAL)); + + child_fid_bytes = offsetof(fid_t, fid_data) + fid->fid_len; + parent_len_bytes = len_bytes - child_fid_bytes; + + if (parent_len_bytes < offsetof(fid_t, fid_data)) + return (ERR_PTR(-EINVAL)); + + pfid = (fid_t *)((char *)fh + child_fid_bytes); + + if (parent_len_bytes < offsetof(fid_t, fid_data) + pfid->fid_len) + return (ERR_PTR(-EINVAL)); + + parent_fh_len = parent_len_bytes / sizeof (__u32); + return (zpl_fh_to_dentry(sb, (struct fid *)pfid, parent_fh_len, + FILEID_INO32_GEN)); +} + /* * In case the filesystem contains name longer than 255, we need to override * the default get_name so we don't get buffer overflow. Unfortunately, since @@ -177,6 +251,7 @@ zpl_commit_metadata(struct inode *inode) const struct export_operations zpl_export_operations = { .encode_fh = zpl_encode_fh, .fh_to_dentry = zpl_fh_to_dentry, + .fh_to_parent = zpl_fh_to_parent, .get_name = zpl_get_name, .get_parent = zpl_get_parent, .commit_metadata = zpl_commit_metadata, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index f7691c02d163..ffe227796f0a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -43,6 +43,9 @@ #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include <linux/writeback.h> #endif +#ifdef HAVE_FILELOCK_HEADER +#include <linux/filelock.h> +#endif /* * When using fallocate(2) to preallocate space, inflate the requested @@ -776,34 +779,23 @@ zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - switch (advice) { - case 
POSIX_FADV_SEQUENTIAL: - case POSIX_FADV_WILLNEED: + if (advice == POSIX_FADV_WILLNEED) { + loff_t rlen = len ? len : i_size_read(ip) - offset; + dmu_prefetch(os, zp->z_id, 0, offset, rlen, + ZIO_PRIORITY_ASYNC_READ); + if (!zn_has_cached_data(zp, offset, offset + rlen - 1)) { + zfs_exit(zfsvfs, FTAG); + return (error); + } + } + #ifdef HAVE_GENERIC_FADVISE - if (zn_has_cached_data(zp, offset, offset + len - 1)) - error = generic_fadvise(filp, offset, len, advice); + error = generic_fadvise(filp, offset, len, advice); #endif - /* - * Pass on the caller's size directly, but note that - * dmu_prefetch_max will effectively cap it. If there - * really is a larger sequential access pattern, perhaps - * dmu_zfetch will detect it. - */ - if (len == 0) - len = i_size_read(ip) - offset; - dmu_prefetch(os, zp->z_id, 0, offset, len, - ZIO_PRIORITY_ASYNC_READ); - break; - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_DONTNEED: - case POSIX_FADV_NOREUSE: - /* ignored for now */ - break; - default: - error = -EINVAL; - break; + if (error == 0 && advice == POSIX_FADV_DONTNEED) { + loff_t rlen = len ? 
len : i_size_read(ip) - offset; + dmu_evict_range(os, zp->z_id, offset, rlen); } zfs_exit(zfsvfs, FTAG); @@ -1242,6 +1234,7 @@ const struct file_operations zpl_file_operations = { .mmap = zpl_mmap, .fsync = zpl_fsync, .fallocate = zpl_fallocate, + .setlease = generic_setlease, .copy_file_range = zpl_copy_file_range, #ifdef HAVE_VFS_CLONE_FILE_RANGE .clone_file_range = zpl_clone_file_range, @@ -1264,6 +1257,7 @@ const struct file_operations zpl_dir_file_operations = { .read = generic_read_dir, .iterate_shared = zpl_iterate, .fsync = zpl_fsync, + .setlease = generic_setlease, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index f97662d052c7..e4e15c824f4b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -506,6 +506,32 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, } #endif +#ifdef STATX_CHANGE_COOKIE + if (request_mask & STATX_CHANGE_COOKIE) { + /* + * knfsd uses the STATX_CHANGE_COOKIE to surface to clients + * change_info4 data, which is used to implement NFS client + * name caching (see RFC 8881 Section 10.8). This number + * should always increase with changes and should not be + * reused. We cannot simply present ctime here because + * ZFS uses a coarse timer to set them, which may cause + * clients to fail to detect changes and invalidate cache. + * + * ZFS always increments znode z_seq number, but this is + * uint_t and so we mask in ctime to upper bits. + * + * STATX_ATTR_CHANGE_MONOTONIC is advertised + * to prevent knfsd from generating the change cookie + * based on ctime. C.f. nfsd4_change_attribute in + * fs/nfsd/nfsfh.c. 
+ */ + stat->change_cookie = + ((u64)stat->ctime.tv_sec << 32) | zp->z_seq; + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; + stat->result_mask |= STATX_CHANGE_COOKIE; + } +#endif + #ifdef STATX_DIOALIGN if (request_mask & STATX_DIOALIGN) { uint64_t align; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 347b352506e5..2cd0f17c860f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -24,6 +24,7 @@ * Copyright (c) 2023, Datto Inc. All rights reserved. * Copyright (c) 2025, Klara, Inc. * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2026, TrueNAS. */ @@ -35,6 +36,8 @@ #include <linux/iversion.h> #include <linux/version.h> #include <linux/vfs_compat.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> /* * What to do when the last reference to an inode is released. If 0, the kernel @@ -265,21 +268,6 @@ zpl_statfs(struct dentry *dentry, struct kstatfs *statp) } static int -zpl_remount_fs(struct super_block *sb, int *flags, char *data) -{ - zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data }; - fstrans_cookie_t cookie; - int error; - - cookie = spl_fstrans_mark(); - error = -zfs_remount(sb, flags, &zm); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { int error; @@ -354,21 +342,6 @@ zpl_show_options(struct seq_file *seq, struct dentry *root) } static int -zpl_fill_super(struct super_block *sb, void *data, int silent) -{ - zfs_mnt_t *zm = (zfs_mnt_t *)data; - fstrans_cookie_t cookie; - int error; - - cookie = spl_fstrans_mark(); - error = -zfs_domount(sb, zm, silent); - spl_fstrans_unmark(cookie); - ASSERT3S(error, <=, 0); - - return (error); -} - -static int zpl_test_super(struct super_block *s, void *data) { zfsvfs_t *zfsvfs = s->s_fs_info; @@ -383,17 +356,477 @@ 
zpl_test_super(struct super_block *s, void *data) return (zfsvfs != NULL && os == zfsvfs->z_os); } -static struct super_block * -zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) +static void +zpl_kill_sb(struct super_block *sb) +{ + zfs_preumount(sb); + kill_anon_super(sb); +} + +void +zpl_prune_sb(uint64_t nr_to_scan, void *arg) +{ + struct super_block *sb = (struct super_block *)arg; + int objects = 0; + + /* + * Ensure the superblock is not in the process of being torn down. + */ +#ifdef HAVE_SB_DYING + if (down_read_trylock(&sb->s_umount)) { + if (!(sb->s_flags & SB_DYING) && sb->s_root && + (sb->s_flags & SB_BORN)) { + (void) zfs_prune(sb, nr_to_scan, &objects); + } + up_read(&sb->s_umount); + } +#else + if (down_read_trylock(&sb->s_umount)) { + if (!hlist_unhashed(&sb->s_instances) && + sb->s_root && (sb->s_flags & SB_BORN)) { + (void) zfs_prune(sb, nr_to_scan, &objects); + } + up_read(&sb->s_umount); + } +#endif +} + +/* + * Mount option parsing. + * + * The kernel receives a set of "stringy" mount options, typically a + * comma-separated list through mount(2) or fsconfig(2). These are split into a + * set of struct fs_parameter, and then vfs_parse_fs_param() is called for + * each. That function will handle (and consume) some options directly, and + * other subsystems (mainly security modules) are given the opportunity to + * consume them too. Any left over are passed to zpl_parse_param(). Our job is + * to use them to fill in the vfs_t we've attached previously to + * fc->fs_private, ready for the mount or remount call when it comes. + * + * Historically, mount options have been generated, removed, modified and + * otherwise complicated by multiple different actors over a long time: the + * kernel itself, the original mount(8) utility and later libmount, + * mount.zfs(8), libzfs and the ZFS tools that use it, and any program using + * the various mount APIs that have come and gone over the years. 
This is + * further complicated by cross-pollination between OpenSolaris/illumos, Linux + * and FreeBSD. Long story short: we could see all sorts of things, and we need + * to at least try not to break old userspace programs. + * + * At time of writing, this is my best understanding of all the options we + * might reasonably see, and where and how they're handled. + * + * + * These are common options for all filesystems that are processed by the + * kernel directly, without zpl_parse_param() being called. They're a bit of a + * mixed bag, but are ultimately all available to us via either sb->s_flags or + * fc->sb_flags: + * + * dirsync: set SB_DIRSYNC + * lazytime: set SB_LAZYTIME + * mand: set SB_MANDLOCK + * ro: set SB_RDONLY + * sync: set SB_SYNCHRONOUS + * + * async: clear SB_SYNCHRONOUS + * nolazytime: clear SB_LAZYTIME + * nomand: clear SB_MANDLOCK + * rw: clear SB_RDONLY + * + * Fortunately, almost all of these are handled directly by the kernel. 'mand' + * and 'nomand' are swallowed by the kernel ('mand' emits a warning in the + * kernel log), but it and the corresponding dataset property have been a no-op + * in OpenZFS for years, so there's nothing for us to do there. + * + * The only tricky one is SB_RDONLY ('ro'/'rw'), which can be both a mount and + * a superblock option. While we won't receive the "stringy" options, the + * kernel will set it for us in fc->sb_flags, and we've always had special + * handling for it at mount and remount time (eg handling snapshot mounts), so + * it's not a problem to do nothing here because we will sort it out later. + * + * + * These are options that we may receive as "stringy" options but also as mount + * flags. 
+ * + * exec: clear MS_NOEXEC + * noexec: set MS_NOEXEC + * suid: clear MS_NOSUID + * nosuid: set MS_NOSUID + * dev: clear MS_NODEV + * nodev: set MS_NODEV + * atime: clear MS_NOATIME + * noatime: set MS_NOATIME + * relatime: set MS_RELATIME + * norelatime: clear MS_RELATIME + * + * In testing, it appears that recent libmount will convert them, but our own + * mount code (libzfs_mount) may not. We will be called for the stringy + * versions, but not for the flags. The flags will later be available on + * vfsmount->mnt_flags, not set on the vfs_t. This tends not to matter in + * practice, as almost all mounts come through libzfs (via zfs-mount(8) or + * mount.zfs(8)) and so as strings, and when they do come through flags, they + * will still be reported correctly via mountinfo and by zfs-get(8), which has + * special handling for "temporary" properties. Also, we never use these + * internally for any decisions; 'exec', 'suid' and 'dev' are handled in the + * kernel, and the kernel provides helpers for 'atime' and 'relatime'. The + * only place the difference is observable is through zfs_get_temporary_prop(), + * which is only used by the zfs.get_prop() Lua call. + * + * This is fixable by getting at vfsmount->mnt_flags, but this is not readily + * available until after the mount operation is completed, and with some + * effort. This is all very low impact, so it's left for future improvement. + * + * + * These are true OpenZFS-specific mount options. They give the equivalent + * of temporarily setting the pool properties as follows: + * + * strictatime atime=on, relatime=off + * + * xattr: xattr=sa + * saxattr: xattr=sa + * dirxattr: xattr=dir + * noxattr: xattr=off + * + * + * mntpoint= provides the canonical mount point for a snapshot mount. This + * is an assist for the snapshot automounter call out to userspace, to + * understand where the snapshot is mounted even when triggered from an + * alternate mount namespace (eg inside a chroot). 
+ *
+ * mntpoint= vfs->vfs_mntpoint=...
+ *
+ *
+ * These are used for coordination inside libzfs, and should not make it
+ * to the kernel, but it does not strip them, so we handle them and ignore
+ * them.
+ *
+ * defaults
+ * zfsutil
+ * remount
+ *
+ *
+ * These are specific to SELinux. When that security module is running, it
+ * will consume them, but if not, they will be passed through to us. libzfs
+ * adds them unconditionally, so we will always see them when SELinux is not
+ * running, and ignore them.
+ *
+ * fscontext
+ * defcontext
+ * rootcontext
+ * context
+ *
+ *
+ * When preparing a remount, libmount will read /proc/self/mountinfo and add
+ * any unrecognised flags it finds there to the options. So, we have to accept
+ * anything that __zpl_show_options() can produce.
+ *
+ * posixacl
+ * noacl
+ * casesensitive
+ * caseinsensitive
+ * casemixed
+ *
+ *
+ * mount(8) has a notion of "sloppy" options. According to the documentation,
+ * when the -s switch is provided, unrecognised mount options will be ignored.
+ * Only the Linux NFS and SMB filesystems support it, and traditionally
+ * OpenZFS has too. However, it appears massively underspecified and
+ * inconsistent. Depending on the interplay between mount(8), the mount helper
+ * (eg mount.zfs(8)) and libmount, -s may cause unknown options to be filtered
+ * in userspace, _or_ an additional option 'sloppy' to be passed to the kernel
+ * either before or after the "unknown" option, _or_ nothing at all happens
+ * and the unknown option is passed through to the kernel as-is. The
+ * kernel NFS and SMB filesystems both expect to see an explicit option
+ * 'sloppy' and use this to either ignore or reject unknown options, but as
+ * described, it's very easy for that option to not appear, or appear too late.
+ * + * OpenZFS has a test for this in the test suite, and it's documented in + * mount.zfs(8), so to support it we accept 'sloppy' and ignore it, and all + * other unknown options produce a notice in the kernel log, and are also + * ignored. This allows the "feature" to continue to work, while avoiding + * the additional housekeeping for the 'sloppy' option. + * + * sloppy + * + * + * Finally, all filesystems get automatic handling for the 'source' option, + * that is, the "name" of the filesystem (the first column of df(1)'s output). + * However, this only happens if the handler does not otherwise handle + * the 'source' option. Since we handle _all_ options because of 'sloppy', we + * deal with this explicitly by calling into the kernel's helper for this, + * vfs_parse_fs_param_source(), which sets up fc->source. + * + * source + * + * + * Thank you for reading this far. I hope you find what you are looking for, + * in this life or the next. + * + * -- robn, 2026-03-26 + */ + +enum { + Opt_exec, Opt_suid, Opt_dev, + Opt_atime, Opt_relatime, Opt_strictatime, + Opt_saxattr, Opt_dirxattr, Opt_noxattr, + Opt_mntpoint, + + Opt_ignore, Opt_warn, +}; + +static const struct fs_parameter_spec zpl_param_spec[] = { + fsparam_flag_no("exec", Opt_exec), + fsparam_flag_no("suid", Opt_suid), + fsparam_flag_no("dev", Opt_dev), + + fsparam_flag_no("atime", Opt_atime), + fsparam_flag_no("relatime", Opt_relatime), + fsparam_flag("strictatime", Opt_strictatime), + + fsparam_flag("xattr", Opt_saxattr), + fsparam_flag("saxattr", Opt_saxattr), + fsparam_flag("dirxattr", Opt_dirxattr), + fsparam_flag("noxattr", Opt_noxattr), + + fsparam_string("mntpoint", Opt_mntpoint), + + fsparam_flag("defaults", Opt_ignore), + fsparam_flag("zfsutil", Opt_ignore), + fsparam_flag("remount", Opt_ignore), + + fsparam_string("fscontext", Opt_ignore), + fsparam_string("defcontext", Opt_ignore), + fsparam_string("rootcontext", Opt_ignore), + fsparam_string("context", Opt_ignore), + + 
fsparam_flag("posixacl", Opt_ignore), + fsparam_flag("noacl", Opt_ignore), + fsparam_flag("casesensitive", Opt_ignore), + fsparam_flag("caseinsensitive", Opt_ignore), + fsparam_flag("casemixed", Opt_ignore), + + fsparam_flag("sloppy", Opt_ignore), + + {} +}; + +static int +zpl_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct super_block *s; + vfs_t *vfs = fc->fs_private; + + /* Handle 'source' explicitly so we don't trip on it as an unknown. */ + int opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return (opt); + + struct fs_parse_result result; + opt = fs_parse(fc, zpl_param_spec, param, &result); + if (opt == -ENOPARAM) { + /* + * Convert unknowns to warnings, to work around the whole + * "sloppy option" mess. + */ + opt = Opt_warn; + } + if (opt < 0) + return (opt); + + switch (opt) { + case Opt_exec: + vfs->vfs_exec = !result.negated; + vfs->vfs_do_exec = B_TRUE; + break; + case Opt_suid: + vfs->vfs_setuid = !result.negated; + vfs->vfs_do_setuid = B_TRUE; + break; + case Opt_dev: + vfs->vfs_devices = !result.negated; + vfs->vfs_do_devices = B_TRUE; + break; + + case Opt_atime: + vfs->vfs_atime = !result.negated; + vfs->vfs_do_atime = B_TRUE; + break; + case Opt_relatime: + vfs->vfs_relatime = !result.negated; + vfs->vfs_do_relatime = B_TRUE; + break; + case Opt_strictatime: + vfs->vfs_atime = B_TRUE; + vfs->vfs_do_atime = B_TRUE; + vfs->vfs_relatime = B_FALSE; + vfs->vfs_do_relatime = B_TRUE; + break; + + case Opt_saxattr: + vfs->vfs_xattr = ZFS_XATTR_SA; + vfs->vfs_do_xattr = B_TRUE; + break; + case Opt_dirxattr: + vfs->vfs_xattr = ZFS_XATTR_DIR; + vfs->vfs_do_xattr = B_TRUE; + break; + case Opt_noxattr: + vfs->vfs_xattr = ZFS_XATTR_OFF; + vfs->vfs_do_xattr = B_TRUE; + break; + + case Opt_mntpoint: + if (vfs->vfs_mntpoint != NULL) + kmem_strfree(vfs->vfs_mntpoint); + vfs->vfs_mntpoint = kmem_strdup(param->string); + break; + + case Opt_ignore: + break; + + case Opt_warn: + cmn_err(CE_NOTE, + "ZFS: ignoring unknown 
mount option: %s", param->key);
+		break;
+
+	default:
+		return (-SET_ERROR(EINVAL));
+	}
+
+	return (0);
+}
+
+/*
+ * Before Linux 5.8, the kernel's individual parameter parsing had a list of
+ * "forbidden" options that would always be rejected early. These were options
+ * that should be specified by MS_* flags, to be set on the superblock
+ * directly. However, it was inconsistently applied (eg it had various "*atime"
+ * options but not "atime"), and also caused problems when it was not in sync
+ * with the version of libmount in use. It was deemed needlessly restrictive
+ * and was dropped in torvalds/linux@9193ae87a8af.
+ *
+ * Unfortunately, some of the options on this list are used by OpenZFS, so
+ * we need to see them. These include the aforementioned "*atime", "dev",
+ * "exec" and "suid".
+ *
+ * There is no easy compile-time check available to detect this, so we use
+ * a simple version check that should make it available everywhere needed,
+ * most notably RHEL8's 4.18+extras, which has backported fs_context support
+ * but does not include the 5.8 commit.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 8, 0)
+#define	HAVE_FORBIDDEN_SB_FLAGS 1
+#endif
+
+#ifdef HAVE_FORBIDDEN_SB_FLAGS
+/*
+ * The typical path for options parsing through mount(2) is:
+ *
+ *   ksys_mount
+ *     do_mount
+ *       generic_parse_monolithic
+ *         vfs_parse_fs_string
+ *           vfs_parse_fs_param
+ *             zpl_parse_param
+ *
+ * vfs_parse_fs_param() calls the internal vfs_parse_sb_flag(), which is
+ * where the "forbidden" flags are applied. If it makes it through there,
+ * it will later call fc->parse_param() ie zpl_parse_param(). We can't
+ * intercept this chain in the middle anywhere; the earliest thing we can
+ * override is generic_parse_monolithic(), substituting our own by setting
+ * fc->parse_monolithic and doing the parsing work ourselves.
+ *
+ * Fortunately, generic_parse_monolithic() is almost entirely splitting the
+ * incoming parameter string on comma and handing off to the rest of the
+ * pipeline. This is easily replaced (almost entirely by reviving a few bits
+ * of our old options parser).
+ *
+ * To keep the change as narrow as possible, we reuse zpl_param_spec and
+ * zpl_parse_param() as much as possible. Once we've parsed the option, we call
+ * fs_parse(zpl_param_spec) to find out if the option is actually one we
+ * explicitly care about. If it is, we call zpl_parse_param() directly,
+ * avoiding vfs_parse_fs_param() and so the risk of being rejected. If it is
+ * not one we explicitly care about, we call vfs_parse_fs_param() as normal,
+ * letting the kernel reject it if it wishes. If it doesn't, it will end up
+ * back in zpl_parse_param() via fc->parse_param, and we can ignore or warn
+ * about it as we normally would.
+ */
+static int
+zpl_parse_monolithic(struct fs_context *fc, void *data)
+{
+	char *mntopts = data;
+
+	if (mntopts == NULL)
+		return (0);
+
+	/*
+	 * Because we supply a .parse_monolithic callback, the kernel does
+	 * no consideration of the options blob at all. Because of this, we
+	 * have to give LSMs a first look at it. They will remove any options
+	 * of interest to them (eg the SELinux *context= options).
+	 */
+	int err = security_sb_eat_lsm_opts(mntopts, &fc->security);
+	if (err)
+		return (err);
+
+	char *key;
+	while ((key = strsep(&mntopts, ",")) != NULL) {
+		if (!*key)
+			continue;
+
+		struct fs_parameter param = {
+			.key = key,
+		};
+
+		char *value = strchr(key, '=');
+		if (value != NULL) {
+			/* Key starts with '='. Kernel ignores, we will too.
 */
+			if (value == key)
+				continue;
+			*value++ = '\0';
+
+			/* key=value is a "string" type, set up for that */
+			param.string = value;
+			param.type = fs_value_is_string;
+			param.size = strlen(value);
+		} else {
+			/* unadorned key is a "flag" type */
+			param.type = fs_value_is_flag;
+		}
+
+		/* Check if this is one of our options. */
+		struct fs_parse_result result;
+		int opt = fs_parse(fc, zpl_param_spec, &param, &result);
+		if (opt >= 0) {
+			/*
+			 * We already know this is one of our options, so a
+			 * failure here would be nonsensical.
+			 */
+			VERIFY0(zpl_parse_param(fc, &param));
+		} else {
+			/*
+			 * Not one of our options, send it through the kernel's
+			 * standard parameter handling.
+			 */
+			err = vfs_parse_fs_param(fc, &param);
+			if (err < 0)
+				return (err);
+		}
+	}
+
+	return (0);
+}
+#endif /* HAVE_FORBIDDEN_SB_FLAGS */
+
+static int
+zpl_get_tree(struct fs_context *fc)
+{
+	struct super_block *sb;
 	objset_t *os;
 	boolean_t issnap = B_FALSE;
 	int err;
 
-	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
+	err = dmu_objset_hold(fc->source, FTAG, &os);
 	if (err)
-		return (ERR_PTR(-err));
+		return (-err);
 
 	/*
 	 * The dsl pool lock must be released prior to calling sget().
@@ -405,7 +838,8 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
 	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
 	dsl_pool_rele(dmu_objset_pool(os), FTAG);
 
-	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+	sb = sget(fc->fs_type, zpl_test_super, set_anon_super,
+	    fc->sb_flags, os);
 
 	/*
 	 * Recheck with the lock held to prevent mounting the wrong dataset
@@ -415,93 +849,161 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
 	 * also s_umount lock is not held there so it would race with
 	 * zfs_umount and zfsvfs can be freed.
*/ - if (!IS_ERR(s) && s->s_fs_info != NULL) { - zfsvfs_t *zfsvfs = s->s_fs_info; + if (!IS_ERR(sb) && sb->s_fs_info != NULL) { + zfsvfs_t *zfsvfs = sb->s_fs_info; if (zpl_enter(zfsvfs, FTAG) == 0) { if (os != zfsvfs->z_os) - err = -SET_ERROR(EBUSY); + err = SET_ERROR(EBUSY); issnap = zfsvfs->z_issnap; zpl_exit(zfsvfs, FTAG); } else { - err = -SET_ERROR(EBUSY); + err = SET_ERROR(EBUSY); } } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); - if (IS_ERR(s)) - return (ERR_CAST(s)); + if (IS_ERR(sb)) + return (PTR_ERR(sb)); if (err) { - deactivate_locked_super(s); - return (ERR_PTR(err)); + deactivate_locked_super(sb); + return (-err); } - if (s->s_root == NULL) { - err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0); + if (sb->s_root == NULL) { + vfs_t *vfs = fc->fs_private; + + /* Apply readonly flag as mount option */ + if (fc->sb_flags & SB_RDONLY) { + vfs->vfs_readonly = B_TRUE; + vfs->vfs_do_readonly = B_TRUE; + } + + fstrans_cookie_t cookie = spl_fstrans_mark(); + err = zfs_domount(sb, fc->source, vfs, + fc->sb_flags & SB_SILENT ? 1 : 0); + spl_fstrans_unmark(cookie); + if (err) { - deactivate_locked_super(s); - return (ERR_PTR(err)); + deactivate_locked_super(sb); + return (-err); } - s->s_flags |= SB_ACTIVE; - } else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) { + + /* + * zfsvfs has taken ownership of the mount options, so we + * need to ensure we don't free them. + */ + fc->fs_private = NULL; + + sb->s_flags |= SB_ACTIVE; + } else if (!issnap && ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)) { /* * Skip ro check for snap since snap is always ro regardless * ro flag is passed by mount or not. 
*/ - deactivate_locked_super(s); - return (ERR_PTR(-EBUSY)); + deactivate_locked_super(sb); + return (-SET_ERROR(EBUSY)); } - return (s); + struct dentry *root = dget(sb->s_root); + if (IS_ERR(root)) + return (PTR_ERR(root)); + + fc->root = root; + return (0); } -static struct dentry * -zpl_mount(struct file_system_type *fs_type, int flags, - const char *osname, void *data) +static int +zpl_reconfigure(struct fs_context *fc) { - zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; + fstrans_cookie_t cookie; + int error; - struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); - if (IS_ERR(sb)) - return (ERR_CAST(sb)); + cookie = spl_fstrans_mark(); + error = -zfs_remount(fc->root->d_sb, fc->fs_private, fc->sb_flags); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); - return (dget(sb->s_root)); -} + if (error == 0) { + /* + * zfsvfs has taken ownership of the mount options, so we + * need to ensure we don't free them. + */ + fc->fs_private = NULL; + } -static void -zpl_kill_sb(struct super_block *sb) -{ - zfs_preumount(sb); - kill_anon_super(sb); + return (error); } -void -zpl_prune_sb(uint64_t nr_to_scan, void *arg) +static int +zpl_dup_fc(struct fs_context *fc, struct fs_context *src_fc) { - struct super_block *sb = (struct super_block *)arg; - int objects = 0; + vfs_t *src_vfs = src_fc->fs_private; + if (src_vfs == NULL) + return (0); + + vfs_t *vfs = zfsvfs_vfs_alloc(); + if (vfs == NULL) + return (-SET_ERROR(ENOMEM)); /* - * Ensure the superblock is not in the process of being torn down. + * This is annoying, but a straight memcpy() would require us to + * reinitialise the lock. 
*/ -#ifdef HAVE_SB_DYING - if (down_read_trylock(&sb->s_umount)) { - if (!(sb->s_flags & SB_DYING) && sb->s_root && - (sb->s_flags & SB_BORN)) { - (void) zfs_prune(sb, nr_to_scan, &objects); - } - up_read(&sb->s_umount); - } -#else - if (down_read_trylock(&sb->s_umount)) { - if (!hlist_unhashed(&sb->s_instances) && - sb->s_root && (sb->s_flags & SB_BORN)) { - (void) zfs_prune(sb, nr_to_scan, &objects); - } - up_read(&sb->s_umount); - } + vfs->vfs_xattr = src_vfs->vfs_xattr; + vfs->vfs_readonly = src_vfs->vfs_readonly; + vfs->vfs_do_readonly = src_vfs->vfs_do_readonly; + vfs->vfs_setuid = src_vfs->vfs_setuid; + vfs->vfs_do_setuid = src_vfs->vfs_do_setuid; + vfs->vfs_exec = src_vfs->vfs_exec; + vfs->vfs_do_exec = src_vfs->vfs_do_exec; + vfs->vfs_devices = src_vfs->vfs_devices; + vfs->vfs_do_devices = src_vfs->vfs_do_devices; + vfs->vfs_do_xattr = src_vfs->vfs_do_xattr; + vfs->vfs_atime = src_vfs->vfs_atime; + vfs->vfs_do_atime = src_vfs->vfs_do_atime; + vfs->vfs_relatime = src_vfs->vfs_relatime; + vfs->vfs_do_relatime = src_vfs->vfs_do_relatime; + vfs->vfs_nbmand = src_vfs->vfs_nbmand; + vfs->vfs_do_nbmand = src_vfs->vfs_do_nbmand; + + mutex_enter(&src_vfs->vfs_mntpt_lock); + if (src_vfs->vfs_mntpoint != NULL) + vfs->vfs_mntpoint = kmem_strdup(src_vfs->vfs_mntpoint); + mutex_exit(&src_vfs->vfs_mntpt_lock); + + fc->fs_private = vfs; + return (0); +} + +static void +zpl_free_fc(struct fs_context *fc) +{ + zfsvfs_vfs_free(fc->fs_private); +} + +const struct fs_context_operations zpl_fs_context_operations = { +#ifdef HAVE_FORBIDDEN_SB_FLAGS + .parse_monolithic = zpl_parse_monolithic, #endif + .parse_param = zpl_parse_param, + .get_tree = zpl_get_tree, + .reconfigure = zpl_reconfigure, + .dup = zpl_dup_fc, + .free = zpl_free_fc, +}; + +static int +zpl_init_fs_context(struct fs_context *fc) +{ + fc->fs_private = zfsvfs_vfs_alloc(); + if (fc->fs_private == NULL) + return (-SET_ERROR(ENOMEM)); + + fc->ops = &zpl_fs_context_operations; + + return (0); } const struct 
super_operations zpl_super_operations = { @@ -517,7 +1019,6 @@ const struct super_operations zpl_super_operations = { .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, .statfs = zpl_statfs, - .remount_fs = zpl_remount_fs, .show_devname = zpl_show_devname, .show_options = zpl_show_options, .show_stats = NULL, @@ -560,7 +1061,7 @@ struct file_system_type zpl_fs_type = { #else .fs_flags = FS_USERNS_MOUNT, #endif - .mount = zpl_mount, + .init_fs_context = zpl_init_fs_context, .kill_sb = zpl_kill_sb, }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 89f9bc555fcf..dc47ff20fd74 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1796,7 +1796,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); - ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(RW_WRITE_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); |
